You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:17 UTC

[01/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins [Forced Update!]

Repository: nutch
Updated Branches:
  refs/heads/NUTCH-2292 9f3ba3eda -> 0bf453e57 (forced update)


http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules
new file mode 100644
index 0000000..705bdb2
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/IntranetCrawling.rules
@@ -0,0 +1,27 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
+# accept hosts in MY.DOMAIN.NAME
++^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
+
+# skip everything else
+-.


[02/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
new file mode 100644
index 0000000..0be1e31
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/test/java/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domain;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestDomainURLFilter {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  @Test
+  public void testFilter() throws Exception {
+
+    String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    domainFilter.setConf(conf);
+    Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+    Assert.assertNull(domainFilter.filter("http://www.google.com"));
+    Assert.assertNull(domainFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+    Assert.assertNull(domainFilter.filter("http://www.adobe.com"));
+  }
+  
+  @Test
+  public void testNoFilter() throws Exception {
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    domainFilter.setConf(conf);
+    Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.google.com"));
+    Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+    Assert.assertNotNull(domainFilter.filter("http://www.adobe.com"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt b/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt
new file mode 100644
index 0000000..2b88c3b
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/test/resources/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/build.xml b/nutch-plugins/urlfilter-domainblacklist/build.xml
new file mode 100644
index 0000000..19ea483
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domainblacklist" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/ivy.xml b/nutch-plugins/urlfilter-domainblacklist/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/plugin.xml b/nutch-plugins/urlfilter-domainblacklist/plugin.xml
new file mode 100644
index 0000000..04eee6e
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-domainblacklist"
+   name="Domain Blacklist URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-domainblacklist.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.domainblacklist"
+              name="Nutch Domain Blacklist URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="DomainBlacklistURLFilter"
+        class="org.apache.nutch.urlfilter.domainblacklist.DomainBlacklistURLFilter">
+        <parameter name="file" value="domainblacklist-urlfilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/pom.xml b/nutch-plugins/urlfilter-domainblacklist/pom.xml
new file mode 100644
index 0000000..a814579
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-domainblacklist</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-domainblacklist</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
new file mode 100644
index 0000000..37b1cdc
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. A url that matches one of the suffixes, domains, or hosts present
+ * in the file is filtered out.
+ * </p>
+ * 
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * 
+ * <pre>
+ * com
+ * apache.org
+ * www.apache.org
+ * </pre>
+ * 
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overriding the more specific.
+ * </p>
+ * 
+ * The domain file defaults to domainblacklist-urlfilter.txt in the classpath
+ * but can be overridden using the:
+ * 
+ * <ol>
+ * <li>
+ * property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and
+ * </li>
+ * <li>
+ * attribute "file" in plugin.xml of this plugin
+ * </li>
+ * </ol>
+ * 
+ * the attribute "file" has higher precedence if defined.
+ */
+public class DomainBlacklistURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainBlacklistURLFilter.class);
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+  private Configuration conf;
+  private String domainFile = null;
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  private void readConfiguration(Reader configReader) throws IOException {
+
+    // read the configuration file, line by line
+    BufferedReader reader = new BufferedReader(configReader);
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        // add non-blank lines and non-commented lines
+        domainSet.add(StringUtils.lowerCase(line.trim()));
+      }
+    }
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainBlacklistURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   * 
+   * @param domainFile
+   *          The domain file, overrides domainblacklist-urlfilter.txt default.
+   * 
+   * @throws IOException
+   */
+  public DomainBlacklistURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "urlfilter-domainblacklist";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // treat a blank (whitespace-only) attribute value as undefined
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    } else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+            + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domainblacklist.file");
+    String stringRules = conf.get("urlfilter.domainblacklist.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+    try {
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public String filter(String url) {
+    try {
+      // match for suffix, domain, and host in that order. more general will
+      // override more specific
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+          || domainSet.contains(host)) {
+        // Matches, filter!
+        return null;
+      }
+
+      // doesn't match, allow
+      return url;
+    } catch (Exception e) {
+
+      // if an error happens, allow the url to pass
+      LOG.error("Could not apply filter on url: " + url + "\n"
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
new file mode 100644
index 0000000..1f0022c
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/main/java/org/apache/nutch/urlfilter/domainblacklist/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to exclude URLs by domain suffixes, domain names, and/or host names.
+ * See {@link org.apache.nutch.urlfilter.domain} for the counterpart (include only URLs
+ * matching host or domain).
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java b/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
new file mode 100644
index 0000000..d253867
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/test/java/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domainblacklist;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+public class TestDomainBlacklistURLFilter {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  @Test
+  public void testFilter() throws Exception {
+
+    String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(
+        domainBlacklistFile);
+    domainBlacklistFilter.setConf(conf);
+    Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.apache.org"));
+    Assert.assertNotNull(domainBlacklistFilter.filter("http://www.google.com"));
+    Assert.assertNotNull(domainBlacklistFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.net"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.foobas.net"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.yahoo.com"));
+    Assert.assertNull(domainBlacklistFilter.filter("http://www.foobar.be"));
+    Assert.assertNotNull(domainBlacklistFilter.filter("http://www.adobe.com"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt b/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt
new file mode 100644
index 0000000..2b88c3b
--- /dev/null
+++ b/nutch-plugins/urlfilter-domainblacklist/src/test/resources/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/README.md
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/README.md b/nutch-plugins/urlfilter-ignoreexempt/README.md
new file mode 100644
index 0000000..d48b672
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/README.md
@@ -0,0 +1,43 @@
+urlfilter-ignoreexempt
+======================
+  This plugin allows certain urls to be exempted when the external links are configured to be ignored.
+  This is useful when a focused crawl is set up but some resources like static files are linked from CDNs (external domains).
+
+# How to enable ?
+Add `urlfilter-ignoreexempt` value to `plugin.includes` property
+```xml
+<property>
+  <name>plugin.includes</name>
+  <value>protocol-http|urlfilter-(regex|ignoreexempt)...</value>
+</property>
+```
+
+# How to configure rules?
+
+open `conf/db-ignore-external-exemptions.txt` and add the regex rules.
+
+## Format :
+
+The format is the same as `regex-urlfilter.txt`.
+ Each non-comment, non-blank line contains a regular expression
+ prefixed by '+' or '-'.  The first matching pattern in the file
+ determines whether a URL is exempted or ignored.  If no pattern
+ matches, the URL is ignored.
+
+
+## Example :
+
+ To exempt urls ending with image extensions, use this rule
+
+`+(?i)\.(jpg|png|gif)$`
+
+   
+   
+## Testing the Rules :
+
+After enabling the plugin and adding your rules to `conf/db-ignore-external-exemptions.txt`, run:
+   
+`bin/nutch plugin urlfilter-ignoreexempt  org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here`
+
+
+This should print `true` for urls which are accepted by configured rules.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/build.xml b/nutch-plugins/urlfilter-ignoreexempt/build.xml
new file mode 100644
index 0000000..105f551
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-ignoreexempt" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+      <include name="**/urlfilter-regex/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+    <pathelement location="${nutch.root}/build/urlfilter-regex/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+    <ant target="compile-test" inheritall="false" dir="../urlfilter-regex"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+    <ant target="deploy" inheritall="false" dir="../urlfilter-regex"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/ivy.xml b/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/plugin.xml b/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
new file mode 100644
index 0000000..4139ca4
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-ignoreexempt"
+   name="External Domain Ignore Exemption"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-ignoreexempt.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+      <import plugin="urlfilter-regex"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.ignoreexempt"
+              name="Ignore Exemption Url Filter"
+              point="org.apache.nutch.net.URLExemptionFilter">
+      <implementation id="ExemptionUrlFilter"
+        class="org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter">
+        <parameter name="file" value="db-ignore-external-exemptions.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/pom.xml b/nutch-plugins/urlfilter-ignoreexempt/pom.xml
new file mode 100644
index 0000000..fd26587
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-ignoreexempt</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-ignoreexempt</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>urlfilter-regex</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
new file mode 100644
index 0000000..bbac300
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLExemptionFilter;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.urlfilter.regex.RegexURLFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.regex.Pattern;
+import java.util.List;
+import java.util.ArrayList;
+
+
+/**
+ * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration
+ * to check if URL is eligible for exemption from 'db.ignore.external'.
+ * When this filter is enabled, the external urls will be checked against configured sequence of regex rules.
+ *<p>
+ * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be
+ * overridden using the property  <code>"db.ignore.external.exemptions.file" in ./conf/nutch-*.xml</code>
+ *</p>
+ *
+ * The exemption rules are specified in plain text file where each line is a rule.
+ * The format is the same as `regex-urlfilter.txt`.
+ * Each non-comment, non-blank line contains a regular expression
+ * prefixed by '+' or '-'.  The first matching pattern in the file
+ * determines whether a URL is exempted or ignored.  If no pattern
+ * matches, the URL is ignored.
+ *
+ * @since Feb 10, 2016
+ * @version 1
+ * @see org.apache.nutch.net.URLExemptionFilter
+ * @see org.apache.nutch.urlfilter.regex.RegexURLFilter
+ */
+public class ExemptionUrlFilter extends RegexURLFilter
+    implements URLExemptionFilter {
+
+  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE
+      = "db.ignore.external.exemptions.file";
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ExemptionUrlFilter.class);
+
+  private List<Pattern> exemptions;
+  private Configuration conf;
+
+  public List<Pattern> getExemptions() {
+    return exemptions;
+  }
+
+  @Override
+  public boolean filter(String fromUrl, String toUrl) {
+    //this implementation does not consider fromUrl param.
+    //the regex rules are applied to toUrl.
+    return this.filter(toUrl) != null;
+  }
+
+  /**
+   * Gets reader for regex rules
+   */
+  protected Reader getRulesReader(Configuration conf)
+      throws IOException {
+    String fileRules = conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  public static void main(String[] args) {
+
+    if (args.length != 1) {
+      System.out.println("Error: Invalid Args");
+      System.out.println("Usage: " +
+          ExemptionUrlFilter.class.getName() + " <url>");
+      return;
+    }
+    String url = args[0];
+    ExemptionUrlFilter instance = new ExemptionUrlFilter();
+    instance.setConf(NutchConfiguration.create());
+    System.out.println(instance.filter(null, url));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
new file mode 100644
index 0000000..ee949c5
--- /dev/null
+++ b/nutch-plugins/urlfilter-ignoreexempt/src/main/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin which identifies exemptions to external urls
+ * when external urls are set to ignore.
+ *
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/build.xml b/nutch-plugins/urlfilter-prefix/build.xml
new file mode 100644
index 0000000..33faa48
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-prefix" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/ivy.xml b/nutch-plugins/urlfilter-prefix/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/plugin.xml b/nutch-plugins/urlfilter-prefix/plugin.xml
new file mode 100644
index 0000000..22cfcaf
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-prefix"
+   name="Prefix URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-prefix.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.prefix"
+              name="Nutch Prefix URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="PrefixURLFilter"
+                      class="org.apache.nutch.urlfilter.prefix.PrefixURLFilter"/>
+      <!-- by default, attribute "file" is undefined, to keep classic behavior.
+      <implementation id="PrefixURLFilter"
+                      class="org.apache.nutch.net.PrefixURLFilter">
+        <parameter name="file" value="urlfilter-prefix.txt"/>
+      </implementation>
+      -->
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/pom.xml b/nutch-plugins/urlfilter-prefix/pom.xml
new file mode 100644
index 0000000..65ad019
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-prefix</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-prefix</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
new file mode 100644
index 0000000..2e955b5
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.prefix;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.*;
+
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.StringReader;
+
+import java.util.List;
+import java.util.ArrayList;
+
+/**
+ * Filters URLs based on a file of URL prefixes. The file is named by (1)
+ * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2)
+ * attribute "file" in plugin.xml of this plugin Attribute "file" has higher
+ * precedence if defined.
+ * 
+ * <p>
+ * The format of this file is one URL prefix per line.
+ * </p>
+ */
+public class PrefixURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(PrefixURLFilter.class);
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+
+  private TrieStringMatcher trie;
+
+  private Configuration conf;
+
+  public PrefixURLFilter() throws IOException {
+
+  }
+
+  public PrefixURLFilter(String stringRules) throws IOException {
+    trie = readConfiguration(new StringReader(stringRules));
+  }
+
+  public String filter(String url) {
+    if (trie.shortestMatch(url) == null)
+      return null;
+    else
+      return url;
+  }
+
+  private TrieStringMatcher readConfiguration(Reader reader) throws IOException {
+
+    BufferedReader in = new BufferedReader(reader);
+    List<String> urlprefixes = new ArrayList<String>();
+    String line;
+
+    while ((line = in.readLine()) != null) {
+      if (line.length() == 0)
+        continue;
+
+      char first = line.charAt(0);
+      switch (first) {
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
+        continue;
+      default:
+        urlprefixes.add(line);
+      }
+    }
+
+    return new PrefixStringMatcher(urlprefixes);
+  }
+
+  public static void main(String args[]) throws IOException {
+
+    PrefixURLFilter filter;
+    if (args.length >= 1)
+      filter = new PrefixURLFilter(args[0]);
+    else
+      filter = new PrefixURLFilter();
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
+        System.out.println(out);
+      }
+    }
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    String pluginName = "urlfilter-prefix";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+    if (attributeFile != null && attributeFile.trim().equals(""))
+      attributeFile = null;
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    } else {
+      // if (LOG.isWarnEnabled()) {
+      // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
+      // plugin "+pluginName);
+      // }
+    }
+
+    String file = conf.get("urlfilter.prefix.file");
+    String stringRules = conf.get("urlfilter.prefix.rules");
+    // attribute "file" takes precedence if defined
+    if (attributeFile != null)
+      file = attributeFile;
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+
+    if (reader == null) {
+      trie = new PrefixStringMatcher(new String[0]);
+    } else {
+      try {
+        trie = readConfiguration(reader);
+      } catch (IOException e) {
+        if (LOG.isErrorEnabled()) {
+          LOG.error(e.getMessage());
+        }
+        // TODO mb@media-style.com: throw Exception? Because broken api.
+        throw new RuntimeException(e.getMessage(), e);
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
new file mode 100644
index 0000000..dbed0be
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/main/java/org/apache/nutch/urlfilter/prefix/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>URL filter plugin to include only URLs which match one of a given list of URL prefixes.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java b/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
new file mode 100644
index 0000000..b7a7ce4
--- /dev/null
+++ b/nutch-plugins/urlfilter-prefix/src/test/java/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.prefix;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+import java.io.IOException;
+
+
+/**
+ * JUnit test for <code>PrefixURLFilter</code>.
+ *
+ * @author Talat Uyarer
+ * @author Cihad Guzel
+ */
+public class TestPrefixURLFilter extends TestCase {
+  private static final String prefixes =
+    "# this is a comment\n" +
+    "\n" +
+    "http://\n" +
+    "https://\n" +
+    "file://\n" +
+    "ftp://\n";
+
+  private static final String[] urls = new String[] {
+    "http://www.example.com/",
+    "https://www.example.com/",
+    "ftp://www.example.com/",
+    "file://www.example.com/",
+    "abcd://www.example.com/",
+    "www.example.com/",
+  };
+
+  private static String[] urlsModeAccept = new String[] {
+    urls[0],
+    urls[1],
+    urls[2],
+    urls[3],
+    null,
+    null
+  };
+
+  private PrefixURLFilter filter = null;
+
+  public static Test suite() {
+    return new TestSuite(TestPrefixURLFilter.class);
+  }
+
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+
+  public void setUp() throws IOException {
+    filter = new PrefixURLFilter(prefixes);
+  }
+
+  public void testModeAccept() {
+    for (int i = 0; i < urls.length; i++) {
+      assertTrue(urlsModeAccept[i] == filter.filter(urls[i]));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/build.xml b/nutch-plugins/urlfilter-regex/build.xml
new file mode 100644
index 0000000..5b80d08
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-regex" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/ivy.xml b/nutch-plugins/urlfilter-regex/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/plugin.xml b/nutch-plugins/urlfilter-regex/plugin.xml
new file mode 100644
index 0000000..34f4a91
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-regex"
+   name="Regex URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-regex.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.regex"
+              name="Nutch Regex URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="RegexURLFilter"
+                      class="org.apache.nutch.urlfilter.regex.RegexURLFilter"/>
+      <!-- by default, attribute "file" is undefined, to keep classic behavior.
+      <implementation id="RegexURLFilter"
+                      class="org.apache.nutch.urlfilter.regex.RegexURLFilter">
+        <parameter name="file" value="urlfilter-regex.txt"/>
+      </implementation>
+      -->
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/pom.xml b/nutch-plugins/urlfilter-regex/pom.xml
new file mode 100644
index 0000000..db9e7bd
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/pom.xml
@@ -0,0 +1,53 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-regex</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-regex</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-regex-filter</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-regex-filter</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>test</scope>
+            <type>test-jar</type>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
new file mode 100644
index 0000000..2988114
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Filters URLs based on a file of regular expressions using the
+ * {@link java.util.regex Java Regex implementation}.
+ */
+public class RegexURLFilter extends RegexURLFilterBase {
+
+  public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
+  public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
+
+  public RegexURLFilter() {
+    super();
+  }
+
+  public RegexURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
+    super(filename);
+  }
+
+  RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException {
+    super(reader);
+  }
+
+  /*
+   * -----------------------------------
+   * <implementation:RegexURLFilterBase>
+   * -----------------------------------
+   */
+
+  /**
+   * Rules specified as a config property will override rules specified as a
+   * config file.
+   */
+  protected Reader getRulesReader(Configuration conf) throws IOException {
+    String stringRules = conf.get(URLFILTER_REGEX_RULES);
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    String fileRules = conf.get(URLFILTER_REGEX_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+  
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+  
+  
+
+  /*
+   * ------------------------------------
+   * </implementation:RegexURLFilterBase>
+   * ------------------------------------
+   */
+
+  public static void main(String args[]) throws IOException {
+    RegexURLFilter filter = new RegexURLFilter();
+    filter.setConf(NutchConfiguration.create());
+    main(filter, args);
+  }
+
+  private class Rule extends RegexRule {
+
+    private Pattern pattern;
+
+    Rule(boolean sign, String regex) {
+      this(sign, regex, null);
+    }
+    
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      pattern = Pattern.compile(regex);
+    }
+
+    protected boolean match(String url) {
+      return pattern.matcher(url).find();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
new file mode 100644
index 0000000..7acf73b
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>URL filter plugin to include and/or exclude URLs matching Java regular expressions.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
new file mode 100644
index 0000000..b86181e
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/java/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.*;
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based test of class <code>RegexURLFilter</code>.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestRegexURLFilter extends RegexURLFilterBaseTest {
+
+  protected URLFilter getURLFilter(Reader rules) {
+    try {
+      return new RegexURLFilter(rules);
+    } catch (IOException e) {
+      Assert.fail(e.toString());
+      return null;
+    }
+  }
+
+  @Test
+  public void test() {
+    test("WholeWebCrawling");
+    test("IntranetCrawling");
+    bench(50, "Benchmarks");
+    bench(100, "Benchmarks");
+    bench(200, "Benchmarks");
+    bench(400, "Benchmarks");
+    bench(800, "Benchmarks");
+  }
+  
+  @Test
+  public void test1838() {
+    test("nutch1838");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
new file mode 100644
index 0000000..c8901e2
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.rules
@@ -0,0 +1,26 @@
+# The url filter file used by the regex URL filter benchmark tests.
+
+# Skips common binary suffixes, probable query URLs, and the
+# .fr/.org/.net domains; accepts everything else.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip .fr .org and .net domains
+-^.*//.*\.fr/
+-^.*//.*\.org/
+-^.*//.*\.net/
+
+# skip everything else
++.

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
new file mode 100644
index 0000000..40bf4ee
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/resources/Benchmarks.urls
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file


[28/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/metadata/TestSpellCheckedMetadata.java b/nutch-core/src/test/java/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
new file mode 100644
index 0000000..ae73ae1
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
@@ -0,0 +1,303 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Properties;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.metadata.SpellCheckedMetadata}.
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestSpellCheckedMetadata {
+
+  private static final int NUM_ITERATIONS = 10000;
+
+  /** Test for the <code>getNormalizedName(String)</code> method. */
+  @Test
+  public void testGetNormalizedName() {
+    Assert.assertEquals("Content-Type",
+        SpellCheckedMetadata.getNormalizedName("Content-Type"));
+    Assert.assertEquals("Content-Type",
+        SpellCheckedMetadata.getNormalizedName("ContentType"));
+    Assert.assertEquals("Content-Type",
+        SpellCheckedMetadata.getNormalizedName("Content-type"));
+    Assert.assertEquals("Content-Type",
+        SpellCheckedMetadata.getNormalizedName("contenttype"));
+    Assert.assertEquals("Content-Type",
+        SpellCheckedMetadata.getNormalizedName("contentype"));
+    Assert.assertEquals("Content-Type",
+        SpellCheckedMetadata.getNormalizedName("contntype"));
+  }
+
+  /** Test for the <code>add(String, String)</code> method. */
+  @Test
+  public void testAdd() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+
+    values = meta.getValues("contentype");
+    Assert.assertEquals(0, values.length);
+
+    meta.add("contentype", "value1");
+    values = meta.getValues("contentype");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1", values[0]);
+
+    meta.add("Content-Type", "value2");
+    values = meta.getValues("contentype");
+    Assert.assertEquals(2, values.length);
+    Assert.assertEquals("value1", values[0]);
+    Assert.assertEquals("value2", values[1]);
+
+    // NOTE : For now, the same value can be added many times.
+    // Should it be changed?
+    meta.add("ContentType", "value1");
+    values = meta.getValues("Content-Type");
+    Assert.assertEquals(3, values.length);
+    Assert.assertEquals("value1", values[0]);
+    Assert.assertEquals("value2", values[1]);
+    Assert.assertEquals("value1", values[2]);
+  }
+
+  /** Test for the <code>set(String, String)</code> method. */
+  @Test
+  public void testSet() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+
+    values = meta.getValues("contentype");
+    Assert.assertEquals(0, values.length);
+
+    meta.set("contentype", "value1");
+    values = meta.getValues("contentype");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1", values[0]);
+
+    meta.set("Content-Type", "value2");
+    values = meta.getValues("contentype");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value2", values[0]);
+
+    meta.set("contenttype", "new value 1");
+    meta.add("contenttype", "new value 2");
+    values = meta.getValues("contentype");
+    Assert.assertEquals(2, values.length);
+    Assert.assertEquals("new value 1", values[0]);
+    Assert.assertEquals("new value 2", values[1]);
+  }
+
+  /** Test for <code>setAll(Properties)</code> method. */
+  @Test
+  public void testSetProperties() {
+    String[] values = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    Properties props = new Properties();
+
+    meta.setAll(props);
+    Assert.assertEquals(0, meta.size());
+
+    props.setProperty("name-one", "value1.1");
+    meta.setAll(props);
+    Assert.assertEquals(1, meta.size());
+    values = meta.getValues("name-one");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1.1", values[0]);
+
+    props.setProperty("name-two", "value2.1");
+    meta.setAll(props);
+    Assert.assertEquals(2, meta.size());
+    values = meta.getValues("name-one");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1.1", values[0]);
+    values = meta.getValues("name-two");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value2.1", values[0]);
+  }
+
+  /** Test for <code>get(String)</code> method. */
+  @Test
+  public void testGet() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    Assert.assertNull(meta.get("a-name"));
+
+    meta.add("a-name", "value-1");
+    Assert.assertEquals("value-1", meta.get("a-name"));
+    meta.add("a-name", "value-2");
+    Assert.assertEquals("value-1", meta.get("a-name"));
+  }
+
+  /** Test for <code>isMultiValued()</code> method. */
+  @Test
+  public void testIsMultiValued() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    Assert.assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value1");
+    Assert.assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value2");
+    Assert.assertTrue(meta.isMultiValued("key"));
+  }
+
+  /** Test for <code>names</code> method. */
+  @Test
+  public void testNames() {
+    String[] names = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    names = meta.names();
+    Assert.assertEquals(0, names.length);
+
+    meta.add("name-one", "value");
+    names = meta.names();
+    Assert.assertEquals(1, names.length);
+    Assert.assertEquals("name-one", names[0]);
+    meta.add("name-two", "value");
+    names = meta.names();
+    Assert.assertEquals(2, names.length);
+  }
+
+  /** Test for <code>remove(String)</code> method. */
+  @Test
+  public void testRemove() {
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    meta.remove("name-one");
+    Assert.assertEquals(0, meta.size());
+    meta.add("name-one", "value-1.1");
+    meta.add("name-one", "value-1.2");
+    meta.add("name-two", "value-2.2");
+    Assert.assertEquals(2, meta.size());
+    Assert.assertNotNull(meta.get("name-one"));
+    Assert.assertNotNull(meta.get("name-two"));
+    meta.remove("name-one");
+    Assert.assertEquals(1, meta.size());
+    Assert.assertNull(meta.get("name-one"));
+    Assert.assertNotNull(meta.get("name-two"));
+    meta.remove("name-two");
+    Assert.assertEquals(0, meta.size());
+    Assert.assertNull(meta.get("name-one"));
+    Assert.assertNull(meta.get("name-two"));
+  }
+
+  /** Test for <code>equals(Object)</code> method. */
+  @Test
+  public void testObject() {
+    SpellCheckedMetadata meta1 = new SpellCheckedMetadata();
+    SpellCheckedMetadata meta2 = new SpellCheckedMetadata();
+    Assert.assertFalse(meta1.equals(null));
+    Assert.assertFalse(meta1.equals("String"));
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.1");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.1");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.2");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.2");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.1");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-two", "value-2.1");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.2");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-two", "value-2.x");
+    Assert.assertFalse(meta1.equals(meta2));
+  }
+
+  /** Test for <code>Writable</code> implementation. */
+  @Test
+  public void testWritable() {
+    SpellCheckedMetadata result = null;
+    SpellCheckedMetadata meta = new SpellCheckedMetadata();
+    result = writeRead(meta);
+    Assert.assertEquals(0, result.size());
+    meta.add("name-one", "value-1.1");
+    result = writeRead(meta);
+    meta.add("Contenttype", "text/html");
+    Assert.assertEquals(1, result.size());
+    Assert.assertEquals(1, result.getValues("name-one").length);
+    Assert.assertEquals("value-1.1", result.get("name-one"));
+    meta.add("name-two", "value-2.1");
+    meta.add("name-two", "value-2.2");
+    result = writeRead(meta);
+    Assert.assertEquals(3, result.size());
+    Assert.assertEquals(1, result.getValues("name-one").length);
+    Assert.assertEquals("value-1.1", result.getValues("name-one")[0]);
+    Assert.assertEquals(2, result.getValues("name-two").length);
+    Assert.assertEquals("value-2.1", result.getValues("name-two")[0]);
+    Assert.assertEquals("value-2.2", result.getValues("name-two")[1]);
+    Assert.assertEquals("text/html", result.get(Metadata.CONTENT_TYPE));
+  }
+
+  /**
+   * IO Test method, usable only when you plan to do changes in metadata to
+   * measure relative performance impact.
+   */
+  @Test
+  public final void testHandlingSpeed() {
+    @SuppressWarnings("unused")
+    SpellCheckedMetadata result;
+    long start = System.currentTimeMillis();
+    for (int i = 0; i < NUM_ITERATIONS; i++) {
+      SpellCheckedMetadata scmd = constructSpellCheckedMetadata();
+      result = writeRead(scmd);
+    }
+    System.out.println(NUM_ITERATIONS + " spellchecked metadata I/O time:"
+        + (System.currentTimeMillis() - start) + "ms.");
+  }
+
+  private SpellCheckedMetadata writeRead(SpellCheckedMetadata meta) {
+    SpellCheckedMetadata readed = new SpellCheckedMetadata();
+    try {
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      meta.write(new DataOutputStream(out));
+      readed.readFields(new DataInputStream(new ByteArrayInputStream(out
+          .toByteArray())));
+    } catch (IOException ioe) {
+      Assert.fail(ioe.toString());
+    }
+    return readed;
+  }
+
+  /**
+   * Assembles a Spellchecked metadata Object.
+   */
+  public static final SpellCheckedMetadata constructSpellCheckedMetadata() {
+    SpellCheckedMetadata scmd = new SpellCheckedMetadata();
+    scmd.add("Content-type", "foo/bar");
+    scmd.add("Connection", "close");
+    scmd.add("Last-Modified", "Sat, 09 Dec 2006 15:09:57 GMT");
+    scmd.add("Server", "Foobar");
+    scmd.add("Date", "Sat, 09 Dec 2006 18:07:20 GMT");
+    scmd.add("Accept-Ranges", "bytes");
+    scmd.add("ETag", "\"1234567-89-01234567\"");
+    scmd.add("Content-Length", "123");
+    scmd.add(Nutch.SEGMENT_NAME_KEY, "segmentzzz");
+    scmd.add(Nutch.SIGNATURE_KEY, "123");
+    return scmd;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
new file mode 100644
index 0000000..ef07907
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category(IntegrationTest.class)
+public class TestURLFilters {
+
+  /**
+   * Testcase for NUTCH-325.
+   * 
+   * @throws URLFilterException
+   */
+  @Test
+  public void testNonExistingUrlFilter() throws URLFilterException {
+    Configuration conf = NutchConfiguration.create();
+    String class1 = "NonExistingFilter";
+    String class2 = "org.apache.nutch.urlfilter.prefix.PrefixURLFilter";
+    conf.set(URLFilters.URLFILTER_ORDER, class1 + " " + class2);
+
+    URLFilters normalizers = new URLFilters(conf);
+    normalizers.filter("http://someurl/");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
new file mode 100644
index 0000000..d29e9d3
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category(IntegrationTest.class)
+public class TestURLNormalizers {
+
+  @Test
+  public void testURLNormalizers() {
+    Configuration conf = NutchConfiguration.create();
+    String clazz1 = "org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer";
+    String clazz2 = "org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer";
+    conf.set("urlnormalizer.order", clazz1 + " " + clazz2);
+
+    URLNormalizers normalizers = new URLNormalizers(conf,
+        URLNormalizers.SCOPE_DEFAULT);
+
+    Assert.assertNotNull(normalizers);
+    try {
+      normalizers.normalize("http://www.example.com/",
+          URLNormalizers.SCOPE_DEFAULT);
+    } catch (MalformedURLException mue) {
+      Assert.fail(mue.toString());
+    }
+
+    // NUTCH-1011 - Get rid of superfluous slashes
+    try {
+      String normalizedSlashes = normalizers.normalize(
+          "http://www.example.com//path/to//somewhere.html",
+          URLNormalizers.SCOPE_DEFAULT);
+      Assert.assertEquals(normalizedSlashes,
+          "http://www.example.com/path/to/somewhere.html");
+    } catch (MalformedURLException mue) {
+      Assert.fail(mue.toString());
+    }
+
+    // HostNormalizer NUTCH-1319
+    try {
+      String normalizedHost = normalizers.normalize(
+          "http://www.example.org//path/to//somewhere.html",
+          URLNormalizers.SCOPE_DEFAULT);
+      Assert.assertEquals(normalizedHost,
+          "http://www.example.org/path/to/somewhere.html");
+    } catch (MalformedURLException mue) {
+      Assert.fail(mue.toString());
+    }
+
+    // check the order
+    int pos1 = -1, pos2 = -1;
+    URLNormalizer[] impls = normalizers
+        .getURLNormalizers(URLNormalizers.SCOPE_DEFAULT);
+    for (int i = 0; i < impls.length; i++) {
+      if (impls[i].getClass().getName().equals(clazz1))
+        pos1 = i;
+      if (impls[i].getClass().getName().equals(clazz2))
+        pos2 = i;
+    }
+    if (pos1 != -1 && pos2 != -1) {
+      Assert.assertTrue("RegexURLNormalizer before BasicURLNormalizer",
+          pos1 < pos2);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestOutlinkExtractor.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestOutlinkExtractor.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestOutlinkExtractor.java
new file mode 100644
index 0000000..1f2c833
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestOutlinkExtractor.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * TestCase to check regExp extraction of URLs.
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * 
+ * @version 1.0
+ */
+public class TestOutlinkExtractor {
+
+  private static Configuration conf = NutchConfiguration.create();
+
+  @Test
+  public void testGetNoOutlinks() {
+    Outlink[] outlinks = null;
+
+    outlinks = OutlinkExtractor.getOutlinks(null, conf);
+    Assert.assertNotNull(outlinks);
+    Assert.assertEquals(0, outlinks.length);
+
+    outlinks = OutlinkExtractor.getOutlinks("", conf);
+    Assert.assertNotNull(outlinks);
+    Assert.assertEquals(0, outlinks.length);
+  }
+
+  @Test
+  public void testGetOutlinksHttp() {
+    Outlink[] outlinks = OutlinkExtractor
+        .getOutlinks(
+            "Test with http://www.nutch.org/index.html is it found? "
+                + "What about www.google.com at http://www.google.de "
+                + "A longer URL could be http://www.sybit.com/solutions/portals.html",
+            conf);
+
+    Assert.assertTrue("Url not found!", outlinks.length == 3);
+    Assert.assertEquals("Wrong URL", "http://www.nutch.org/index.html",
+        outlinks[0].getToUrl());
+    Assert.assertEquals("Wrong URL", "http://www.google.de",
+        outlinks[1].getToUrl());
+    Assert.assertEquals("Wrong URL",
+        "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl());
+  }
+
+  @Test
+  public void testGetOutlinksHttp2() {
+    Outlink[] outlinks = OutlinkExtractor
+        .getOutlinks(
+            "Test with http://www.nutch.org/index.html is it found? "
+                + "What about www.google.com at http://www.google.de "
+                + "A longer URL could be http://www.sybit.com/solutions/portals.html",
+            "http://www.sybit.de", conf);
+
+    Assert.assertTrue("Url not found!", outlinks.length == 3);
+    Assert.assertEquals("Wrong URL", "http://www.nutch.org/index.html",
+        outlinks[0].getToUrl());
+    Assert.assertEquals("Wrong URL", "http://www.google.de",
+        outlinks[1].getToUrl());
+    Assert.assertEquals("Wrong URL",
+        "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl());
+  }
+
+  @Test
+  public void testGetOutlinksFtp() {
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(
+        "Test with ftp://www.nutch.org is it found? "
+            + "What about www.google.com at ftp://www.google.de", conf);
+
+    Assert.assertTrue("Url not found!", outlinks.length > 1);
+    Assert.assertEquals("Wrong URL", "ftp://www.nutch.org",
+        outlinks[0].getToUrl());
+    Assert.assertEquals("Wrong URL", "ftp://www.google.de",
+        outlinks[1].getToUrl());
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java
new file mode 100644
index 0000000..550a260
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseData.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.util.WritableTestUtils;
+import org.apache.nutch.metadata.Metadata;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for ParseData. */
+
+public class TestParseData {
+
+  @Test
+  public void testParseData() throws Exception {
+
+    String title = "The Foo Page";
+
+    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo"),
+        new Outlink("http://bar.com/", "Bar") };
+
+    Metadata metaData = new Metadata();
+    metaData.add("Language", "en/us");
+    metaData.add("Charset", "UTF-8");
+
+    ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
+        metaData);
+
+    WritableTestUtils.testWritable(r, null);
+  }
+
+  @Test
+  public void testMaxOutlinks() throws Exception {
+    Outlink[] outlinks = new Outlink[128];
+    for (int i = 0; i < outlinks.length; i++) {
+      outlinks[i] = new Outlink("http://outlink.com/" + i, "Outlink" + i);
+    }
+    ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS,
+        "Max Outlinks Title", outlinks, new Metadata());
+    ParseData data = (ParseData) WritableTestUtils.writeRead(original, null);
+    Assert.assertEquals(outlinks.length, data.getOutlinks().length);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java
new file mode 100644
index 0000000..241b293
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParseText.java
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.util.WritableTestUtils;
+import org.junit.Test;
+
+/** Unit tests for ParseText. */
+
+public class TestParseText {
+
+  @Test
+  public void testParseText() throws Exception {
+
+    String page = "Hello World The Quick Brown Fox Jumped Over the Lazy Fox";
+    ParseText s = new ParseText(page);
+    WritableTestUtils.testWritable(s);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
new file mode 100644
index 0000000..198e284
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Unit test for new parse plugin selection.
+ * 
+ * @author Sebastien Le Callonnec
+ * @version 1.0
+ */
+@Category(IntegrationTest.class)
+public class TestParserFactory {
+
+  private Configuration conf;
+  private ParserFactory parserFactory;
+
+  /** Inits the Test Case with the test parse-plugin file */
+  @Before
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("plugin.includes", ".*");
+    conf.set("parse.plugin.file",
+        "org/apache/nutch/parse/parse-plugin-test.xml");
+    parserFactory = new ParserFactory(conf);
+  }
+
+  /** Unit test for <code>getExtensions(String)</code> method. */
+  @Test
+  public void testGetExtensions() throws Exception {
+    Extension ext = parserFactory.getExtensions("text/html").get(0);
+    Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId());
+    ext = parserFactory.getExtensions("text/html; charset=ISO-8859-1").get(0);
+    Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId());
+    ext = parserFactory.getExtensions("foo/bar").get(0);
+    Assert.assertEquals("parse-tika", ext.getDescriptor().getPluginId());
+  }
+
+  /** Unit test to check <code>getParsers</code> method */
+  @Test
+  public void testGetParsers() throws Exception {
+    Parser[] parsers = parserFactory.getParsers("text/html", "http://foo.com");
+    Assert.assertNotNull(parsers);
+    Assert.assertEquals(1, parsers.length);
+    Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0]
+        .getClass().getName());
+
+    parsers = parserFactory.getParsers("text/html; charset=ISO-8859-1",
+        "http://foo.com");
+    Assert.assertNotNull(parsers);
+    Assert.assertEquals(1, parsers.length);
+    Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0]
+        .getClass().getName());
+
+    parsers = parserFactory.getParsers("application/x-javascript",
+        "http://foo.com");
+    Assert.assertNotNull(parsers);
+    Assert.assertEquals(1, parsers.length);
+    Assert.assertEquals("org.apache.nutch.parse.js.JSParseFilter", parsers[0]
+        .getClass().getName());
+
+    parsers = parserFactory.getParsers("text/plain", "http://foo.com");
+    Assert.assertNotNull(parsers);
+    Assert.assertEquals(1, parsers.length);
+    Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0]
+        .getClass().getName());
+
+    Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0];
+    Parser parser2 = parserFactory.getParsers("*", "http://foo.com")[0];
+
+    Assert.assertEquals("Different instances!", parser1.hashCode(),
+        parser2.hashCode());
+
+    // test and make sure that the rss parser is loaded even though its
+    // plugin.xml
+    // doesn't claim to support text/rss, only application/rss+xml
+    parsers = parserFactory.getParsers("text/rss", "http://foo.com");
+    Assert.assertNotNull(parsers);
+    Assert.assertEquals(1, parsers.length);
+    Assert.assertEquals("org.apache.nutch.parse.tika.TikaParser", parsers[0]
+        .getClass().getName());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml b/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml
new file mode 100644
index 0000000..b748905
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/parse/parse-plugin-test.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+    Author     : mattmann 
+    Description: Test parse-plugins.xml file. 
+-->
+
+<parse-plugins>
+
+  <!--  by default if the mimeType is set to *, or 
+        if it can't be determined, use parse-tika -->
+  <mimeType name="*">
+    <plugin id="parse-tika" />
+  </mimeType>
+	
+  <!--  test these 4 plugins -->
+  <mimeType name="text/html">
+    <!--
+     ! Test that if a parser cannot be instantiated,
+     ! it should not block the process and then the next one is used
+     !-->
+    <plugin id="parse-plugin-that-not-exist"/>
+  </mimeType>
+ 	 
+  <mimeType name="application/x-javascript">
+    <plugin id="parse-js"/>
+  </mimeType>
+ 	 
+  <mimeType name="text/rss">
+    <!-- Test that an extension-id can be directly used here -->
+    <plugin id="org.apache.nutch.parse.rss.RSSParser"/>
+  </mimeType>
+
+  <!--  alias mappings for parse-xxx names to the actual extension implementation 
+  ids described in each plugin's plugin.xml file -->
+  <aliases>
+    <alias name="parse-js"
+           extension-id="JSParser" />
+    <alias name="parse-rss"
+           extension-id="org.apache.nutch.parse.rss.RSSParser" />
+    <alias name="parse-tika"
+           extension-id="org.apache.nutch.parse.tika.TikaParser" />	
+  </aliases>
+</parse-plugins>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/HelloWorldExtension.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/HelloWorldExtension.java b/nutch-core/src/test/java/org/apache/nutch/plugin/HelloWorldExtension.java
new file mode 100644
index 0000000..fa564c4
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/plugin/HelloWorldExtension.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.plugin;
+
+/**
+ * Simple test extension.
+ * 
+ * @author joa23
+ */
+public class HelloWorldExtension implements ITestExtension {
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see
+   * org.apache.nutch.plugin.ITestExtension#testGetExtension(java.lang.String)
+   */
+  public String testGetExtension(String hello) {
+    return hello + " World";
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java b/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java
new file mode 100644
index 0000000..b6aa81d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/plugin/ITestExtension.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+/**
+ * A Simple Test Extension Interface.
+ * 
+ * @author joa23
+ * 
+ */
+public interface ITestExtension {
+  public String testGetExtension(String hello);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java b/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java
new file mode 100644
index 0000000..080142d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/plugin/SimpleTestPlugin.java
@@ -0,0 +1,57 @@
+/*
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.plugin;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Simple Test plugin
+ * 
+ * @author joa23
+ */
+public class SimpleTestPlugin extends Plugin {
+
+  /**
+   * @param pDescriptor
+   * @param conf
+   */
+  public SimpleTestPlugin(PluginDescriptor pDescriptor, Configuration conf) {
+
+    super(pDescriptor, conf);
+  }
+
+  /*
+   * @see org.apache.nutch.plugin.Plugin#startUp()
+   */
+  public void startUp() throws PluginRuntimeException {
+    System.err.println("start up Plugin: " + getDescriptor().getPluginId());
+
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see org.apache.nutch.plugin.Plugin#shutDown()
+   */
+  public void shutDown() throws PluginRuntimeException {
+    System.err.println("shutdown Plugin: " + getDescriptor().getPluginId());
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
new file mode 100644
index 0000000..7bcc9ab
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
@@ -0,0 +1,305 @@
+/*
+ /**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Unit tests for the plugin system
+ */
+@Category(IntegrationTest.class)
+public class TestPluginSystem {
+  private int fPluginCount;
+
+  private LinkedList<File> fFolders = new LinkedList<File>();
+  private Configuration conf;
+  private PluginRepository repository;
+
+  @Before
+  public void setUp() throws Exception {
+    this.conf = NutchConfiguration.create();
+    conf.set("plugin.includes", ".*");
+    // String string = this.conf.get("plugin.includes", "");
+    // conf.set("plugin.includes", string + "|Dummy*");
+    fPluginCount = 5;
+    createDummyPlugins(fPluginCount);
+    this.repository = PluginRepository.get(conf);
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see junit.framework.TestCase#tearDown()
+   */
+  @After
+  public void tearDown() throws Exception {
+    for (int i = 0; i < fFolders.size(); i++) {
+      File folder = fFolders.get(i);
+      delete(folder);
+      folder.delete();
+    }
+  }
+
+  /**
+   */
+  @Test
+  public void testPluginConfiguration() {
+    String string = getPluginFolder();
+    File file = new File(string);
+    if (!file.exists()) {
+      file.mkdir();
+    }
+    Assert.assertTrue(file.exists());
+  }
+
+  /**
+   */
+  @Test
+  public void testLoadPlugins() {
+    PluginDescriptor[] descriptors = repository.getPluginDescriptors();
+    int k = descriptors.length;
+    Assert.assertTrue(fPluginCount <= k);
+    for (int i = 0; i < descriptors.length; i++) {
+      PluginDescriptor descriptor = descriptors[i];
+      if (!descriptor.getPluginId().startsWith("getPluginFolder()")) {
+        continue;
+      }
+      Assert.assertEquals(1, descriptor.getExportedLibUrls().length);
+      Assert.assertEquals(1, descriptor.getNotExportedLibUrls().length);
+    }
+  }
+
+  @Test
+  public void testRepositoryCache() {
+    Configuration config = NutchConfiguration.create();
+    PluginRepository repo = PluginRepository.get(config);
+    JobConf job = new NutchJob(config);
+    PluginRepository repo1 = PluginRepository.get(job);
+    Assert.assertTrue(repo == repo1);
+    // now construct a config without UUID
+    config = new Configuration();
+    config.addResource("nutch-default.xml");
+    config.addResource("nutch-site.xml");
+    repo = PluginRepository.get(config);
+    job = new NutchJob(config);
+    repo1 = PluginRepository.get(job);
+    Assert.assertTrue(repo1 != repo);
+  }
+
+  /**
+   *  
+   */
+  @Test
+  public void testGetExtensionAndAttributes() {
+    String xpId = " sdsdsd";
+    ExtensionPoint extensionPoint = repository.getExtensionPoint(xpId);
+    Assert.assertEquals(extensionPoint, null);
+    Extension[] extension1 = repository.getExtensionPoint(getGetExtensionId())
+        .getExtensions();
+    Assert.assertEquals(extension1.length, fPluginCount);
+    for (int i = 0; i < extension1.length; i++) {
+      Extension extension2 = extension1[i];
+      String string = extension2.getAttribute(getGetConfigElementName());
+      Assert.assertEquals(string, getParameterValue());
+    }
+  }
+
+  /**
+   * @throws PluginRuntimeException
+   */
+  @Test
+  public void testGetExtensionInstances() throws PluginRuntimeException {
+    Extension[] extensions = repository.getExtensionPoint(getGetExtensionId())
+        .getExtensions();
+    Assert.assertEquals(extensions.length, fPluginCount);
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      Object object = extension.getExtensionInstance();
+      if (!(object instanceof HelloWorldExtension))
+        Assert.fail(" object is not a instance of HelloWorldExtension");
+      ((ITestExtension) object).testGetExtension("Bla ");
+      String string = ((ITestExtension) object).testGetExtension("Hello");
+      Assert.assertEquals("Hello World", string);
+    }
+  }
+
+  /**
+   * 
+   *  
+   */
+  @Test
+  public void testGetClassLoader() {
+    PluginDescriptor[] descriptors = repository.getPluginDescriptors();
+    for (int i = 0; i < descriptors.length; i++) {
+      PluginDescriptor descriptor = descriptors[i];
+      Assert.assertNotNull(descriptor.getClassLoader());
+    }
+  }
+
+  /**
+   * @throws IOException
+   */
+  @Test
+  public void testGetResources() throws IOException {
+    PluginDescriptor[] descriptors = repository.getPluginDescriptors();
+    for (int i = 0; i < descriptors.length; i++) {
+      PluginDescriptor descriptor = descriptors[i];
+      if (!descriptor.getPluginId().startsWith("getPluginFolder()")) {
+        continue;
+      }
+      String value = descriptor.getResourceString("key", Locale.UK);
+      Assert.assertEquals("value", value);
+      value = descriptor.getResourceString("key", Locale.TRADITIONAL_CHINESE);
+      Assert.assertEquals("value", value);
+
+    }
+  }
+
+  /**
+   * @return a PluginFolderPath
+   */
+  private String getPluginFolder() {
+    String[] strings = conf.getStrings("plugin.folders");
+    if (strings == null || strings.length == 0)
+      Assert.fail("no plugin directory setuped..");
+
+    String name = strings[0];
+    return new PluginManifestParser(conf, this.repository)
+        .getPluginFolder(name).toString();
+  }
+
+  /**
+   * Creates some Dummy Plugins
+   * 
+   * @param pCount
+   */
+  private void createDummyPlugins(int pCount) {
+    String string = getPluginFolder();
+    try {
+      File folder = new File(string);
+      folder.mkdir();
+      for (int i = 0; i < pCount; i++) {
+        String pluginFolder = string + File.separator + "DummyPlugin" + i;
+        File file = new File(pluginFolder);
+        file.mkdir();
+        fFolders.add(file);
+        createPluginManifest(i, file.getAbsolutePath());
+        createResourceFile(file.getAbsolutePath());
+      }
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Creates a resource file
+   * 
+   * @param pFolderPath
+   * @throws FileNotFoundException
+   * @throws IOException
+   */
+  private void createResourceFile(String pFolderPath)
+      throws FileNotFoundException, IOException {
+    Properties properties = new Properties();
+    properties.setProperty("key", "value");
+    properties.store(new FileOutputStream(pFolderPath + File.separator
+        + "messages" + ".properties"), "");
+  }
+
+  /**
+   * Deletes files in path
+   * 
+   * @param path
+   * @throws IOException
+   */
+  private void delete(File path) throws IOException {
+    File[] files = path.listFiles();
+    for (int i = 0; i < files.length; ++i) {
+      if (files[i].isDirectory())
+        delete(files[i]);
+      files[i].delete();
+    }
+  }
+
+  /**
+   * Creates a plugin manifest file
+   * 
+   * @param i
+   * @param pFolderPath
+   * @throws IOException
+   */
+  private void createPluginManifest(int i, String pFolderPath)
+      throws IOException {
+    FileWriter out = new FileWriter(pFolderPath + File.separator + "plugin.xml");
+    String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
+        + "<!--this is just a simple plugin for testing issues.-->"
+        + "<plugin id=\"org.apache.nutch.plugin."
+        + i
+        + "\" name=\""
+        + i
+        + "\" version=\"1.0\" provider-name=\"joa23\" "
+        + "class=\"org.apache.nutch.plugin.SimpleTestPlugin\">"
+        + "<extension-point id=\"aExtensioID\" "
+        + "name=\"simple Parser Extension\" "
+        + "schema=\"schema/testExtensionPoint.exsd\"/>"
+        + "<runtime><library name=\"libs/exported.jar\"><extport/></library>"
+        + "<library name=\"libs/not_exported.jar\"/></runtime>"
+        + "<extension point=\"aExtensioID\">"
+        + "<implementation name=\"simple Parser Extension\" "
+        + "id=\"aExtensionId.\" class=\"org.apache.nutch.plugin.HelloWorldExtension\">"
+        + "<parameter name=\"dummy-name\" value=\"a simple param value\"/>"
+        + "</implementation></extension></plugin>";
+    out.write(xml);
+    out.flush();
+    out.close();
+  }
+
+  private String getParameterValue() {
+    return "a simple param value";
+  }
+
+  private static String getGetExtensionId() {
+    return "aExtensioID";
+  }
+
+  private static String getGetConfigElementName() {
+    return "dummy-name";
+  }
+
+  public static void main(String[] args) throws IOException {
+    new TestPluginSystem().createPluginManifest(1, "/");
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java b/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java
new file mode 100644
index 0000000..1475cda
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/protocol/TestContent.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.WritableTestUtils;
+import org.apache.tika.mime.MimeTypes;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for Content. */
+
+public class TestContent {
+
+  private static Configuration conf = NutchConfiguration.create();
+
+  @Test
+  public void testContent() throws Exception {
+
+    String page = "<HTML><BODY><H1>Hello World</H1><P>The Quick Brown Fox Jumped Over the Lazy Fox.</BODY></HTML>";
+
+    String url = "http://www.foo.com/";
+
+    SpellCheckedMetadata metaData = new SpellCheckedMetadata();
+    metaData.add("Host", "www.foo.com");
+    metaData.add("Content-Type", "text/html");
+
+    Content r = new Content(url, url, page.getBytes("UTF8"), "text/html",
+        metaData, conf);
+
+    WritableTestUtils.testWritable(r);
+    Assert.assertEquals("text/html", r.getMetadata().get("Content-Type"));
+    Assert.assertEquals("text/html", r.getMetadata().get("content-type"));
+    Assert.assertEquals("text/html", r.getMetadata().get("CONTENTYPE"));
+  }
+
+  /** Unit tests for getContentType(String, String, byte[]) method. */
+  @Test
+  public void testGetContentType() throws Exception {
+    Content c = null;
+    Metadata p = new Metadata();
+
+    c = new Content("http://www.foo.com/", "http://www.foo.com/",
+        "".getBytes("UTF8"), "text/html; charset=UTF-8", p, conf);
+    Assert.assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/",
+        "".getBytes("UTF8"), "", p, conf);
+    Assert.assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/",
+        "".getBytes("UTF8"), null, p, conf);
+    Assert.assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/", "http://www.foo.com/",
+        "<html></html>".getBytes("UTF8"), "", p, conf);
+    Assert.assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.html", "http://www.foo.com/",
+        "<html></html>".getBytes("UTF8"), "text/plain", p, conf);
+    Assert.assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/foo.png", "http://www.foo.com/",
+        "<html></html>".getBytes("UTF8"), "text/plain", p, conf);
+    Assert.assertEquals("text/html", c.getContentType());
+
+    c = new Content("http://www.foo.com/", "http://www.foo.com/",
+        "".getBytes("UTF8"), "", p, conf);
+    Assert.assertEquals(MimeTypes.OCTET_STREAM, c.getContentType());
+
+    c = new Content("http://www.foo.com/", "http://www.foo.com/",
+        "".getBytes("UTF8"), null, p, conf);
+    Assert.assertNotNull(c.getContentType());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
new file mode 100644
index 0000000..6b4c8fd
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.ObjectCache;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category(IntegrationTest.class)
+public class TestProtocolFactory {
+
+  Configuration conf;
+  ProtocolFactory factory;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("plugin.includes", ".*");
+    conf.set("http.agent.name", "test-bot");
+    factory = new ProtocolFactory(conf);
+  }
+
+  @Test
+  public void testGetProtocol() {
+
+    // non existing protocol
+    try {
+      factory.getProtocol("xyzxyz://somehost");
+      Assert.fail("Must throw ProtocolNotFound");
+    } catch (ProtocolNotFound e) {
+      // all is ok
+    } catch (Exception ex) {
+      Assert.fail("Must not throw any other exception");
+    }
+
+    Protocol httpProtocol = null;
+
+    // existing protocol
+    try {
+      httpProtocol = factory.getProtocol("http://somehost");
+      Assert.assertNotNull(httpProtocol);
+    } catch (Exception ex) {
+      Assert.fail("Must not throw any other exception");
+    }
+
+    // cache key
+    Object protocol = ObjectCache.get(conf).getObject(
+        Protocol.X_POINT_ID + "http");
+    Assert.assertNotNull(protocol);
+    Assert.assertEquals(httpProtocol, protocol);
+
+    // test same object instance
+    try {
+      Assert.assertTrue(httpProtocol == factory.getProtocol("http://somehost"));
+    } catch (ProtocolNotFound e) {
+      Assert.fail("Must not throw any exception");
+    }
+  }
+
+  @Test
+  public void testContains() {
+    Assert.assertTrue(factory.contains("http", "http"));
+    Assert.assertTrue(factory.contains("http", "http,ftp"));
+    Assert.assertTrue(factory.contains("http", "   http ,   ftp"));
+    Assert.assertTrue(factory.contains("smb", "ftp,smb,http"));
+    Assert.assertFalse(factory.contains("smb", "smbb"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java
new file mode 100644
index 0000000..6657c42
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMerger.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.text.DecimalFormat;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestSegmentMerger {
+  Configuration conf;
+  FileSystem fs;
+  Path testDir;
+  Path seg1;
+  Path seg2;
+  Path out;
+  int countSeg1, countSeg2;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    fs = FileSystem.get(conf);
+    testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+    seg1 = new Path(testDir, "seg1");
+    seg2 = new Path(testDir, "seg2");
+    out = new Path(testDir, "out");
+
+    // create large parse-text segments
+    System.err.println("Creating large segment 1...");
+    DecimalFormat df = new DecimalFormat("0000000");
+    Text k = new Text();
+    Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000");
+    Option kOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option vOpt = SequenceFile.Writer.valueClass(ParseText.class);
+    MapFile.Writer w = new MapFile.Writer(conf, ptPath, kOpt, vOpt);
+    long curSize = 0;
+    countSeg1 = 0;
+    FileStatus fileStatus = fs.getFileStatus(ptPath);
+    long blkSize = fileStatus.getBlockSize();
+
+    while (curSize < blkSize * 2) {
+      k.set("seg1-" + df.format(countSeg1));
+      w.append(k, new ParseText("seg1 text " + countSeg1));
+      countSeg1++;
+      curSize += 40; // roughly ...
+    }
+    w.close();
+    System.err.println(" - done: " + countSeg1 + " records.");
+    System.err.println("Creating large segment 2...");
+    ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(ParseText.class);
+    w = new MapFile.Writer(conf, ptPath, wKeyOpt, wValueOpt);
+    curSize = 0;
+    countSeg2 = 0;
+    while (curSize < blkSize * 2) {
+      k.set("seg2-" + df.format(countSeg2));
+      w.append(k, new ParseText("seg2 text " + countSeg2));
+      countSeg2++;
+      curSize += 40; // roughly ...
+    }
+    w.close();
+    System.err.println(" - done: " + countSeg2 + " records.");
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    fs.delete(testDir, true);
+  }
+
+  @Test
+  public void testLargeMerge() throws Exception {
+    SegmentMerger merger = new SegmentMerger(conf);
+    merger.merge(out, new Path[] { seg1, seg2 }, false, false, -1);
+    // verify output
+    FileStatus[] stats = fs.listStatus(out);
+    // there should be just one path
+    Assert.assertEquals(1, stats.length);
+    Path outSeg = stats[0].getPath();
+    Text k = new Text();
+    ParseText v = new ParseText();
+    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
+        outSeg, ParseText.DIR_NAME), conf);
+    int cnt1 = 0, cnt2 = 0;
+    for (MapFile.Reader r : readers) {
+      while (r.next(k, v)) {
+        String ks = k.toString();
+        String vs = v.getText();
+        if (ks.startsWith("seg1-")) {
+          cnt1++;
+          Assert.assertTrue(vs.startsWith("seg1 "));
+        } else if (ks.startsWith("seg2-")) {
+          cnt2++;
+          Assert.assertTrue(vs.startsWith("seg2 "));
+        }
+      }
+      r.close();
+    }
+    Assert.assertEquals(countSeg1, cnt1);
+    Assert.assertEquals(countSeg2, cnt2);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
new file mode 100644
index 0000000..aaed8bc
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
@@ -0,0 +1,427 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.text.DecimalFormat;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * New SegmentMerger unit test focusing on several crappy issues with the
+ * segment merger. The general problem is disappearing records and incorrect
+ * CrawlDatum status values. This unit test performs random sequences of segment
+ * merging where we're looking for an expected status. A second test is able to
+ * randomly inject redirects in segment, likely causing the segment merger to
+ * fail resulting in a bad merged segment.
+ * 
+ * See also:
+ * 
+ * https://issues.apache.org/jira/browse/NUTCH-1113
+ * https://issues.apache.org/jira/browse/NUTCH-1616
+ * https://issues.apache.org/jira/browse/NUTCH-1520
+ * 
+ * Cheers!
+ */
+public class TestSegmentMergerCrawlDatums {
+  Configuration conf;
+  FileSystem fs;
+  Random rnd;
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestSegmentMergerCrawlDatums.class);
+
+  @Before
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    fs = FileSystem.get(conf);
+    rnd = new Random();
+  }
+
+  /**
+   *
+   */
+  @Test
+  public void testSingleRandomSequence() throws Exception {
+    Assert.assertEquals(
+        new Byte(CrawlDatum.STATUS_FETCH_SUCCESS),
+        new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE,
+            CrawlDatum.STATUS_FETCH_SUCCESS, 256, false)));
+  }
+
+  /**
+   *
+   */
+  @Test
+  public void testMostlyRedirects() throws Exception {
+    // Our test directory
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
+    Path segment1 = new Path(testDir, "20140110114943");
+    Path segment2 = new Path(testDir, "20140110114832");
+    Path segment3 = new Path(testDir, "20140110114558");
+    Path segment4 = new Path(testDir, "20140110114930");
+    Path segment5 = new Path(testDir, "20140110114545");
+    Path segment6 = new Path(testDir, "20140110114507");
+    Path segment7 = new Path(testDir, "20140110114903");
+    Path segment8 = new Path(testDir, "20140110114724");
+
+    createSegment(segment1, CrawlDatum.STATUS_FETCH_SUCCESS, true);
+    createSegment(segment2, CrawlDatum.STATUS_FETCH_SUCCESS, true);
+    createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, true);
+    createSegment(segment4, CrawlDatum.STATUS_FETCH_SUCCESS, true);
+    createSegment(segment5, CrawlDatum.STATUS_FETCH_SUCCESS, true);
+    createSegment(segment6, CrawlDatum.STATUS_FETCH_SUCCESS, false);
+    createSegment(segment7, CrawlDatum.STATUS_FETCH_SUCCESS, true);
+    createSegment(segment8, CrawlDatum.STATUS_FETCH_SUCCESS, true);
+
+    // Merge the segments and get status
+    Path mergedSegment = merge(testDir, new Path[] { segment1, segment2,
+        segment3, segment4, segment5, segment6, segment7, segment8 });
+    Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment));
+
+    Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
+  }
+
+  /**
+   *
+   */
+  @Test
+  public void testRandomizedSequences() throws Exception {
+    for (int i = 0; i < rnd.nextInt(16) + 16; i++) {
+      byte expectedStatus = (byte) (rnd.nextInt(6) + 0x21);
+      while (expectedStatus == CrawlDatum.STATUS_FETCH_RETRY
+          || expectedStatus == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
+        // fetch_retry and fetch_notmodified never remain in a merged segment
+        expectedStatus = (byte) (rnd.nextInt(6) + 0x21);
+      }
+      byte randomStatus = (byte) (rnd.nextInt(6) + 0x21);
+      int rounds = rnd.nextInt(16) + 32;
+      boolean withRedirects = rnd.nextBoolean();
+
+      byte resultStatus = executeSequence(randomStatus, expectedStatus, rounds,
+          withRedirects);
+      Assert.assertEquals(
+          "Expected status = " + CrawlDatum.getStatusName(expectedStatus)
+              + ", but got " + CrawlDatum.getStatusName(resultStatus)
+              + " when merging " + rounds + " segments"
+              + (withRedirects ? " with redirects" : ""), expectedStatus,
+          resultStatus);
+    }
+  }
+
+  /**
+   *
+   */
+  @Test
+  public void testRandomTestSequenceWithRedirects() throws Exception {
+    Assert.assertEquals(
+        new Byte(CrawlDatum.STATUS_FETCH_SUCCESS),
+        new Byte(executeSequence(CrawlDatum.STATUS_FETCH_GONE,
+            CrawlDatum.STATUS_FETCH_SUCCESS, 128, true)));
+  }
+
+  /**
+   * Check a fixed sequence!
+   */
+  @Test
+  public void testFixedSequence() throws Exception {
+    // Our test directory
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
+    Path segment1 = new Path(testDir, "00001");
+    Path segment2 = new Path(testDir, "00002");
+    Path segment3 = new Path(testDir, "00003");
+
+    createSegment(segment1, CrawlDatum.STATUS_FETCH_GONE, false);
+    createSegment(segment2, CrawlDatum.STATUS_FETCH_GONE, true);
+    createSegment(segment3, CrawlDatum.STATUS_FETCH_SUCCESS, false);
+
+    // Merge the segments and get status
+    Path mergedSegment = merge(testDir, new Path[] { segment1, segment2,
+        segment3 });
+    Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment));
+
+    Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
+  }
+
+  /**
+   * Check a fixed sequence!
+   */
+  @Test
+  public void testRedirFetchInOneSegment() throws Exception {
+    // Our test directory
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
+    Path segment = new Path(testDir, "00001");
+
+    createSegment(segment, CrawlDatum.STATUS_FETCH_SUCCESS, true, true);
+
+    // Merge the segments and get status
+    Path mergedSegment = merge(testDir, new Path[] { segment });
+    Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment));
+
+    Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
+  }
+
+  /**
+   * Check a fixed sequence!
+   */
+  @Test
+  public void testEndsWithRedirect() throws Exception {
+    // Our test directory
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
+    Path segment1 = new Path(testDir, "00001");
+    Path segment2 = new Path(testDir, "00002");
+
+    createSegment(segment1, CrawlDatum.STATUS_FETCH_SUCCESS, false);
+    createSegment(segment2, CrawlDatum.STATUS_FETCH_SUCCESS, true);
+
+    // Merge the segments and get status
+    Path mergedSegment = merge(testDir, new Path[] { segment1, segment2 });
+    Byte status = new Byte(status = checkMergedSegment(testDir, mergedSegment));
+
+    Assert.assertEquals(new Byte(CrawlDatum.STATUS_FETCH_SUCCESS), status);
+  }
+
+  /**
+   * Execute a sequence of creating segments, merging them and checking the
+   * final output
+   * 
+   * @param status
+   *          to start with
+   * @param status
+   *          to end with
+   * @param number
+   *          of rounds
+   * @param whether
+   *          redirects are injected randomly
+   * @return the CrawlDatum status
+   */
+  protected byte executeSequence(byte firstStatus, byte lastStatus, int rounds,
+      boolean redirect) throws Exception {
+    // Our test directory
+    Path testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-"
+        + System.currentTimeMillis());
+
+    // Format for the segments
+    DecimalFormat df = new DecimalFormat("0000000");
+
+    // Create our segment paths
+    Path[] segmentPaths = new Path[rounds];
+    for (int i = 0; i < rounds; i++) {
+      String segmentName = df.format(i);
+      segmentPaths[i] = new Path(testDir, segmentName);
+    }
+
+    // Create the first segment according to the specified status
+    createSegment(segmentPaths[0], firstStatus, false);
+
+    // Create N segments with random status and optionally with randomized
+    // redirect injection
+    for (int i = 1; i < rounds - 1; i++) {
+      // Status, 6 possibilities incremented with 33 hex
+      byte status = (byte) (rnd.nextInt(6) + 0x21);
+
+      // Whether this is going to be a redirect
+      boolean addRedirect = redirect ? rnd.nextBoolean() : false;
+      // If it's a redirect we add a datum resulting from a fetch at random,
+      // if not: always add a fetch datum to avoid empty segments
+      boolean addFetch = addRedirect ? rnd.nextBoolean() : true;
+
+      createSegment(segmentPaths[i], status, addFetch, addRedirect);
+    }
+
+    // Create the last segment according to the specified status
+    // (additionally, add a redirect at random)
+    createSegment(segmentPaths[rounds - 1], lastStatus, true,
+        redirect ? rnd.nextBoolean() : false);
+
+    // Merge the segments!
+    Path mergedSegment = merge(testDir, segmentPaths);
+
+    // Check the status of the final record and return it
+    return checkMergedSegment(testDir, mergedSegment);
+  }
+
+  /**
+   * Checks the merged segment and removes the stuff again.
+   * 
+   * @param the
+   *          test directory
+   * @param the
+   *          merged segment
+   * @return the final status
+   */
+  protected byte checkMergedSegment(Path testDir, Path mergedSegment)
+      throws Exception {
+    // Get a MapFile reader for the <Text,CrawlDatum> pairs
+    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
+        mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
+
+    Text key = new Text();
+    CrawlDatum value = new CrawlDatum();
+    byte finalStatus = 0x0;
+
+    for (MapFile.Reader reader : readers) {
+      while (reader.next(key, value)) {
+        LOG.info("Reading status for: " + key.toString() + " > "
+            + CrawlDatum.getStatusName(value.getStatus()));
+
+        // Only consider fetch status
+        if (CrawlDatum.hasFetchStatus(value)
+            && key.toString().equals("http://nutch.apache.org/")) {
+          finalStatus = value.getStatus();
+        }
+      }
+
+      // Close the reader again
+      reader.close();
+    }
+
+    // Remove the test directory again
+    fs.delete(testDir, true);
+
+    LOG.info("Final fetch status for: http://nutch.apache.org/ > "
+        + CrawlDatum.getStatusName(finalStatus));
+
+    // Return the final status
+    return finalStatus;
+  }
+
+  /**
+   * Merge some segments!
+   * 
+   * @param the
+   *          test directory
+   * @param the
+   *          segments to merge
+   * @return Path to the merged segment
+   */
+  protected Path merge(Path testDir, Path[] segments) throws Exception {
+    // Our merged output directory
+    Path out = new Path(testDir, "out");
+
+    // Merge
+    SegmentMerger merger = new SegmentMerger(conf);
+    merger.merge(out, segments, false, false, -1);
+
+    FileStatus[] stats = fs.listStatus(out);
+    Assert.assertEquals(1, stats.length);
+
+    return stats[0].getPath();
+  }
+
+  /**
+   * Create a segment with the specified status.
+   * 
+   * @param the
+   *          segment's paths
+   * @param the
+   *          status of the record, ignored if redirect is true
+   * @param whether
+   *          we're doing a redirect as well
+   */
+  protected void createSegment(Path segment, byte status, boolean redirect)
+      throws Exception {
+    if (redirect) {
+      createSegment(segment, status, false, true);
+    } else {
+      createSegment(segment, status, true, false);
+    }
+  }
+
+  protected void createSegment(Path segment, byte status, boolean fetch,
+      boolean redirect) throws Exception {
+    LOG.info("\nSegment: " + segment.toString());
+
+    // The URL of our main record
+    String url = "http://nutch.apache.org/";
+
+    // The URL of our redirecting URL
+    String redirectUrl = "http://nutch.apache.org/i_redirect_to_the_root/";
+
+    // Our value
+    CrawlDatum value = new CrawlDatum();
+
+    // Path of the segment's crawl_fetch directory
+    Path crawlFetchPath = new Path(
+        new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
+
+    // Get a writer for map files containing <Text,CrawlDatum> pairs
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(conf, crawlFetchPath, wKeyOpt, wValueOpt);
+
+    // Whether we're handling a redirect now
+    // first add the linked datum
+    // - before redirect status because url sorts before redirectUrl
+    // - before fetch status to check whether fetch datum is preferred over
+    // linked datum when merging
+    if (redirect) {
+      // We're writing our our main record URL with status linked
+      LOG.info(url + " > " + CrawlDatum.getStatusName(CrawlDatum.STATUS_LINKED));
+      value = new CrawlDatum();
+      value.setStatus(CrawlDatum.STATUS_LINKED);
+      writer.append(new Text(url), value);
+    }
+
+    // Whether we're fetching now
+    if (fetch) {
+      LOG.info(url + " > " + CrawlDatum.getStatusName(status));
+
+      // Set the status
+      value.setStatus(status);
+
+      // Write the pair and ok
+      writer.append(new Text(url), value);
+    }
+
+    // Whether we're handing a redirect now
+    if (redirect) {
+      // And the redirect URL with redirect status, pointing to our main URL
+      LOG.info(redirectUrl + " > "
+          + CrawlDatum.getStatusName(CrawlDatum.STATUS_FETCH_REDIR_TEMP));
+      value.setStatus(CrawlDatum.STATUS_FETCH_REDIR_TEMP);
+      writer.append(new Text(redirectUrl), value);
+    }
+
+    // Close the stuff
+    writer.close();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java b/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java
new file mode 100644
index 0000000..1ee16c4
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/service/TestNutchServer.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service;
+
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestNutchServer {
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestNutchServer.class);
+  NutchServer server = NutchServer.getInstance();
+
+  private int port[] = {8081, 9999, 9100, 8900};
+  private final String ENDPOINT_ADDRESS = "http://localhost:";
+
+  @Test
+  public void testNutchServerStartup() {
+    boolean isRunning = false;
+    for(int i=0;i<port.length; i++) {
+      try {
+        startServer(port[i]);
+        isRunning = true;
+        break;
+      }catch(Exception e) {
+        LOG.info("Could not start server on port: {}. Tries remaining {}", port[i], port.length-i);
+      }
+    }
+    if(!isRunning) {
+      LOG.info("Could not start server, all ports in use");
+    }
+    else {
+      LOG.info("Testing admin endpoint");
+      WebClient client = WebClient.create(ENDPOINT_ADDRESS + server.getPort());
+      Response response = client.path("admin").get();
+      //Assert.assertTrue(response.readEntity(String.class).contains("startDate"));
+      response = client.path("stop").get();
+      //Assert.assertTrue(response.readEntity(String.class).contains("Stopping"));
+    }
+  }
+
+  private void startServer(int port) throws Exception{
+    NutchServer.setPort(port);
+    NutchServer.startServer();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java b/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java
new file mode 100644
index 0000000..131b667
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/test/IntegrationTest.java
@@ -0,0 +1,6 @@
+package org.apache.nutch.test;
+
+/**
+ * A marker interface for marking integration tests
+ */
+public interface IntegrationTest {}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java b/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java
new file mode 100644
index 0000000..87d37a5
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/test/TestUtils.java
@@ -0,0 +1,29 @@
+package org.apache.nutch.test;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.net.URISyntaxException;
+import java.net.URL;
+
+public class TestUtils {
+
+  /**
+   *
+   * @param obj an object whose class's loader should be used
+   * @param fileName name of file
+   * @return File instance
+   * @throws FileNotFoundException when an error occurs or file is not found
+   */
+  public static File getFile(Object obj, String fileName)
+      throws FileNotFoundException {
+    try {
+      URL resource = obj.getClass().getClassLoader().getResource(fileName);
+      if (resource == null) {
+        throw new FileNotFoundException(fileName + " not known to classloader of " + obj);
+      }
+      return new File(resource.toURI());
+    } catch (URISyntaxException e) {
+      throw new FileNotFoundException(e.getMessage());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
new file mode 100644
index 0000000..fef0e69
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+//Junit imports
+import static org.junit.Assert.*;
+
+import org.apache.nutch.test.TestUtils;
+import org.junit.Test;
+
+//Commons imports
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.filefilter.FileFilterUtils;
+
+//JDK imports
+import java.io.File;
+import java.nio.file.Files;
+import java.util.Collection;
+
+//Nutch imports
+import org.apache.nutch.tools.CommonCrawlDataDumper;
+import org.apache.nutch.tools.CommonCrawlConfig;
+
+/**
+ * 
+ * Test harness for the {@link CommonCrawlDataDumper}.
+ *
+ */
+public class TestCommonCrawlDataDumper {
+
+  @Test
+  public void testDump() throws Exception {
+    File sampleSegmentDir = TestUtils.getFile(this, "test-segments");
+    File tempDir = Files.createTempDirectory("temp").toFile();
+
+    String[] crawledFiles = {
+        "c463a4381eb837f9f5d45978cfbde79e_.html",
+        "a974b8d74f7779ab6c6f90b9b279467e_.html",
+        "6bc6497314656a3129732efd708e9f96_.html",
+        "6e88c40abe26cad0a726102997aed048_.html",
+        "5cafdd88f4e9cf3f0cd4c298c6873358_apachecon-europe.html",
+        "932dc10a76e894a2baa8ea4086ad72a8_apachecon-north-america.html",
+        "8540187d75b9cd405b8fa97d665f9f90_.html",
+        "e501bc976c8693b4d28a55b79c390a32_.html",
+        "6add662f9f5758b7d75eec5cfa1f340b_.html",
+        "d4f20df3c37033dc516067ee1f424e4e_.html",
+        "d7b8fa9a02cdc95546030d04be4a98f3_solr.html",
+        "3cbe876e3a8e7a397811de3bb6a945cd_.html",
+        "5b987dde0da79d7f2e3f22b46437f514_bot.html",
+        "3d742820d9a701a1f02e10d5bf5ae633_credits.html",
+        "693673f3c73d04a26276effdea69b7ee_downloads.html",
+        "4f7e3469dafabb4c3b87b00531f81aa4_index.html",
+        "15c5330675be8a69995aab18ff9859e0_javadoc.html",
+        "bc624e1b49e29870ef095819bb0e977a_mailing_lists.html",
+        "a7d66b68754c3665c66e62225255e3fd_version_control.html",
+        "32fb7fe362e1a0d8a1b15addf2a00bdc_1.9-rel",
+        "54ab3db10fe7b26415a04e21045125a8_1zE.html",
+        "1012a41c08092c40340598bd8ee0bfa6_PGa.html",
+        "c830cfc5c28bed10e69d5b83e9c1bcdc_nutch_2.3",
+        "687d915dc264a77f35c61ba841936730_oHY.html",
+        "2bf1afb650010128b4cf4afe677db3c5_1pav9xl.html",
+        "550cab79e14110bbee61c36c61c830b0_1pbE15n.html",
+        "664ff07b46520cc1414494ae49da91f6_.html",
+        "04223714e648a6a43d7c8af8b095f733_.html",
+        "3c8ccb865cd72cca06635d74c7f2f3c4_.html",
+        "90fe47b28716a2230c5122c83f0b8562_Becoming_A_Nutch_Developer.html",
+        "ac0fefe70007d40644e2b8bd5da3c305_FAQ.html",
+        "bc9bc7f11c1262e8924032ab1c7ce112_NutchPropertiesCompleteList.html",
+        "78d04611985e7375b441e478fa36f610_.html",
+        "64adaebadd44e487a8b58894e979dc70_CHANGES.txt",
+        "a48e9c2659b703fdea3ad332877708d8_.html",
+        "159d66d679dd4442d2d8ffe6a83b2912_sponsorship.html",
+        "66f1ce6872c9195c665fc8bdde95f6dc_thanks.html",
+        "ef7ee7e929a048c4a119af78492095b3_.html",
+        "e4251896a982c2b2b68678b5c9c57f4d_.html",
+        "5384764a16fab767ebcbc17d87758a24_.html",
+        "a6ba75a218ef2a09d189cb7dffcecc0f_.html",
+        "f2fa63bd7a3aca63841eed4cd10fb519_SolrCloud.html",
+        "f8de0fbda874e1a140f1b07dcebab374_NUTCH-1047.html",
+        "9c120e94f52d690e9cfd044c34134649_NUTCH-1591.html",
+        "7dd70378379aa452279ce9200d0a5fed_NUTCH-841.html",
+        "ddf78b1fe5c268d59fd62bc745815b92_.html",
+        "401c9f04887dbbf8d29ad52841b8bdb3_ApacheNutch.html",
+        "8f984e2d3c2ba68d1695288f1738deaf_Nutch.html",
+        "c2ef09a95a956207cea073a515172be2_FrontPage.html",
+    "90d9b76e8eabdab1cbcc29bea437c7ae_NutchRESTAPI.html" };
+
+    CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(
+        new CommonCrawlConfig());
+    dumper.dump(tempDir, sampleSegmentDir, null, false, null, false, "", false);
+
+    Collection<File> tempFiles = FileUtils.listFiles(tempDir,
+        FileFilterUtils.fileFileFilter(),
+        FileFilterUtils.directoryFileFilter());
+
+    for (String expectedFileName : crawledFiles) {
+      assertTrue("Missed file " + expectedFileName + " in dump", 
+          hasFile(expectedFileName, tempFiles));
+    }
+
+  }
+
+  private boolean hasFile(String fileName, Collection<File> files) {
+    for (File f : files) {
+      if (f.getName().equals(fileName)) {
+        return true;
+      }
+    }
+    return false;
+  }
+}


[30/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
new file mode 100644
index 0000000..d6f33a3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
@@ -0,0 +1,91 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8" />
+<title>Wicket extend</title>
+</head>
+
+<body>
+	<wicket:extend>
+		<h2>
+			<wicket:message key="page.header.seedList">Seed list</wicket:message>
+		</h2>
+
+		<div class="row">
+			<div class="col-lg-8">
+				<form class="form-horizontal" wicket:id="seedList">
+					<fieldset>
+						<!-- Text input-->
+						<div class="form-group">
+							<label class="col-md-4 control-label" for="textinput">Seed list name</label>
+							<div class="col-md-4">
+								<input wicket:id="name" name="textinput" class="form-control input-md" type="text">
+							</div>
+						</div>
+						<div class="form-group">
+							<div class="col-md-offset-4 col-md-4">
+								<button type="submit" class="btn btn-primary">Save</button>
+							</div>
+						</div>
+					</fieldset>
+				</form>
+				<h3>Seed urls</h3>
+				<table class="table table-hover table-striped tablesorter">
+					<thead>
+						<tr>
+							<th class="header col-md-3">Url</th>
+							<th></th>
+						</tr>
+					</thead>
+
+					<tbody wicket:id="seedUrlsTable">
+						<tr wicket:id="seedUrls">
+							<td>
+								<span wicket:id="url">http://google.com</span>
+							</td>
+							<td>
+								<button wicket:id="delete" class="btn btn-sm btn-danger" type="button">
+									<span class="fa fa-trash-o"></span>
+								</button>
+							</td>
+						</tr>
+					</tbody>
+				</table>
+				<form class="form-horizontal" wicket:id="urlForm">
+					<fieldset>
+						<div class="form-group">
+							<div class="col-md-4">
+								<input wicket:id="url" name="textinput" class="form-control input-md" type="text">
+							</div>
+							<div>
+								<button wicket:id="addUrl" class="btn btn-primary">Add url</button>
+							</div>
+						</div>
+					</fieldset>
+				</form>
+			</div>
+			<div class="col-lg-4">
+				<div class="panel panel-primary">
+					<div class="panel-heading">
+						<h3 class="panel-title">Help</h3>
+					</div>
+					<div class="panel-body">
+						<p>Some help about seed management</p>
+					</div>
+				</div>
+			</div>
+		</div>
+		<!--row-->
+	</wicket:extend>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
new file mode 100644
index 0000000..fba07ab
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.seed;
+
+import java.util.Iterator;
+
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.model.SeedUrl;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.SeedListService;
+import org.apache.wicket.ajax.AjaxRequestTarget;
+import org.apache.wicket.ajax.markup.html.AjaxLink;
+import org.apache.wicket.ajax.markup.html.form.AjaxSubmitLink;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.form.Form;
+import org.apache.wicket.markup.html.form.TextField;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.CompoundPropertyModel;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.model.LoadableDetachableModel;
+import org.apache.wicket.model.Model;
+import org.apache.wicket.request.mapper.parameter.PageParameters;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+import com.google.common.collect.Lists;
+
+/**
+ * This page is for seed URL management.
+ * 
+ * @author feodor
+ * 
+ */
+public class SeedPage extends AbstractBasePage<SeedList> {
+
+  @SpringBean
+  private SeedListService seedListService;
+
+  private Form<SeedUrl> urlForm;
+
+  private WebMarkupContainer seedUrlsTable;
+
+  public SeedPage() {
+    SeedList list = new SeedList();
+    list.setSeedUrls(Lists.<SeedUrl> newArrayList());
+    initPage(Model.of(list));
+  }
+
+  public SeedPage(final PageParameters parameters) {
+    initPage(new LoadableDetachableModel<SeedList>() {
+
+      @Override
+      protected SeedList load() {
+        Long seedListId = parameters.get("id").toLongObject();
+        return seedListService.getSeedList(seedListId);
+      }
+    });
+  }
+
+  public void initPage(IModel<SeedList> model) {
+    setModel(new CompoundPropertyModel<SeedList>(model));
+
+    addBaseForm();
+    addSeedUrlsList();
+    addUrlForm();
+  }
+
+  private void addBaseForm() {
+    Form<SeedList> form = new Form<SeedList>("seedList", getModel()) {
+      @Override
+      protected void onSubmit() {
+        seedListService.save(getModelObject());
+        setResponsePage(SeedListsPage.class);
+      }
+    };
+    form.add(new TextField<String>("name"));
+    add(form);
+  }
+
+  private void addSeedUrlsList() {
+    seedUrlsTable = new WebMarkupContainer("seedUrlsTable");
+    seedUrlsTable.setOutputMarkupId(true);
+
+    RefreshingView<SeedUrl> seedUrls = new RefreshingView<SeedUrl>("seedUrls") {
+
+      @Override
+      protected Iterator<IModel<SeedUrl>> getItemModels() {
+        return new CpmIteratorAdapter<SeedUrl>(getModelObject().getSeedUrls());
+      }
+
+      @Override
+      protected void populateItem(Item<SeedUrl> item) {
+        item.add(new Label("url"));
+        item.add(new AjaxLink<SeedUrl>("delete", item.getModel()) {
+
+          @Override
+          public void onClick(AjaxRequestTarget target) {
+            deleteSeedUrl(getModelObject());
+            target.add(seedUrlsTable);
+          }
+        });
+      }
+    };
+    seedUrlsTable.add(seedUrls);
+    add(seedUrlsTable);
+  }
+
+  private void addUrlForm() {
+    urlForm = new Form<SeedUrl>("urlForm", CompoundPropertyModel.of(Model
+        .of(new SeedUrl())));
+    urlForm.setOutputMarkupId(true);
+    urlForm.add(new TextField<String>("url"));
+    urlForm.add(new AjaxSubmitLink("addUrl", urlForm) {
+      @Override
+      protected void onSubmit(AjaxRequestTarget target, Form<?> form) {
+        addSeedUrl();
+        urlForm.setModelObject(new SeedUrl());
+        target.add(urlForm);
+        target.add(seedUrlsTable);
+      }
+    });
+    add(urlForm);
+  }
+
+  private void addSeedUrl() {
+    SeedUrl url = urlForm.getModelObject();
+    SeedList seedList = getModelObject();
+    url.setSeedList(seedList);
+    seedList.getSeedUrls().add(url);
+  }
+
+  private void deleteSeedUrl(SeedUrl url) {
+    SeedList seedList = getModelObject();
+    seedList.getSeedUrls().remove(url);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
new file mode 100644
index 0000000..8810371
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
@@ -0,0 +1,43 @@
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8" />
+<title>Wicket extend</title>
+</head>
+
+<body>
+	<wicket:extend>
+		<h2>
+			<wicket:message key="settings">Settings</wicket:message>
+		</h2>
+		<div class="row">
+			<div class="col-lg-12">
+				<table class="table table-hover tablesorter table-bordered">
+					<thead>
+						<tr>
+							<th class="header col-lg-3">
+								<wicket:message key="settings.header.name">Name</wicket:message>
+							</th>
+							<th class="header col-lg-9">
+								<wicket:message key="settings.header.value">Value</wicket:message>
+							</th>
+						</tr>
+					</thead>
+					<tbody wicket:id="settingsTable">
+						<tr wicket:id="settings">
+							<td>
+								<span wicket:id="name">Name</span>
+							</td>
+							<td>
+<!-- 								<span wicket:id="value">Value</span> -->
+					<input class="col-lg-12" wicket:id="value" placeholder="http://localhost:8080">
+							</td>
+						</tr>
+					</tbody>
+				</table>
+			</div>
+		</div>
+	</wicket:extend>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
new file mode 100644
index 0000000..29e46f7
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
@@ -0,0 +1,59 @@
+package org.apache.nutch.webui.pages.settings;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.nutch.webui.model.NutchConfig;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.NutchService;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.form.TextField;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+public class SettingsPage extends AbstractBasePage<Void> {
+  @SpringBean
+  private NutchService nutchService;
+
+  private WebMarkupContainer settingsTable;
+
+  public SettingsPage() {
+    settingsTable = new WebMarkupContainer("settingsTable");
+    settingsTable.setOutputMarkupId(true);
+    RefreshingView<NutchConfig> nutchConfig = new RefreshingView<NutchConfig>(
+        "settings") {
+
+      @Override
+      protected Iterator<IModel<NutchConfig>> getItemModels() {
+        return new CpmIteratorAdapter<NutchConfig>(
+            convertNutchConfig(nutchService.getNutchConfig(getCurrentInstance()
+                .getId())));
+      }
+
+      @Override
+      protected void populateItem(Item<NutchConfig> item) {
+        item.add(new Label("name"));
+        item.add(new TextField<String>("value"));
+      }
+    };
+    settingsTable.add(nutchConfig);
+    add(settingsTable);
+  }
+
+  private List<NutchConfig> convertNutchConfig(Map<String, String> map) {
+    List<NutchConfig> listNutchConfigs = new LinkedList<NutchConfig>();
+    for (String key : map.keySet()) {
+      NutchConfig conf = new NutchConfig();
+      conf.setName(key);
+      conf.setValue(map.get(key));
+      listNutchConfigs.add(conf);
+    }
+    return listNutchConfigs;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
new file mode 100644
index 0000000..c742b48
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.model.NutchInstance;
+
+public interface CrawlService {
+
+  public void saveCrawl(Crawl crawl);
+
+  public List<Crawl> getCrawls();
+
+  void startCrawl(Long crawlId, NutchInstance instance);
+
+  void deleteCrawl(Long crawlId);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
new file mode 100644
index 0000000..23f27e8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.NutchInstance;
+
+public interface NutchInstanceService {
+
+  public List<NutchInstance> getInstances();
+
+  public void saveInstance(NutchInstance instance);
+
+  public void removeInstance(Long id);
+
+  public NutchInstance getInstance(Long id);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
new file mode 100644
index 0000000..643236a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.Map;
+
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.NutchStatus;
+
+public interface NutchService {
+  public ConnectionStatus getConnectionStatus(Long instanceId);
+
+  public Map<String, String> getNutchConfig(Long instanceId);
+
+  public NutchStatus getNutchStatus(Long instanceId);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
new file mode 100644
index 0000000..dda8c71
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.SeedList;
+
+public interface SeedListService {
+
+  public void save(SeedList seedList);
+
+  public void delete(Long seedListId);
+
+  public List<SeedList> findAll();
+
+  public SeedList getSeedList(Long seedListId);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
new file mode 100644
index 0000000..7bb133b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClient;
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.client.impl.CrawlingCycle;
+import org.apache.nutch.webui.client.impl.RemoteCommandsBatchFactory;
+import org.apache.nutch.webui.client.impl.CrawlingCycleListener;
+import org.apache.nutch.webui.client.impl.RemoteCommand;
+import org.apache.nutch.webui.client.impl.RemoteCommandExecutor;
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.client.model.Crawl.CrawlStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.CrawlService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.scheduling.annotation.Async;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+@Service
+public class CrawlServiceImpl implements CrawlService, CrawlingCycleListener {
+  private Logger log = LoggerFactory.getLogger(CrawlServiceImpl.class);
+
+  @Resource
+  private Dao<Crawl, Long> crawlDao;
+
+  @Resource
+  private NutchClientFactory nutchClientFactory;
+
+  @Resource
+  private RemoteCommandsBatchFactory commandFactory;
+
+  @Override
+  @Async
+  public void startCrawl(Long crawlId, NutchInstance instance) {
+    Crawl crawl = null;
+    try {
+      crawl = crawlDao.queryForId(crawlId);
+      if(crawl.getCrawlId()==null) {
+        crawl.setCrawlId("crawl-" + crawlId.toString());
+      }
+      NutchClient client = nutchClientFactory.getClient(instance);
+      String seedDirectory = client.createSeed(crawl.getSeedList());
+      crawl.setSeedDirectory(seedDirectory);
+
+      List<RemoteCommand> commands = commandFactory.createCommands(crawl);
+      RemoteCommandExecutor executor = new RemoteCommandExecutor(client);
+
+      CrawlingCycle cycle = new CrawlingCycle(this, executor, crawl, commands);
+      cycle.executeCrawlCycle();
+
+    } catch (Exception e) {
+      crawl.setStatus(CrawlStatus.ERROR);
+      saveCrawl(crawl);
+      log.error("exception occured", e);
+    }
+  }
+
+  @Override
+  public List<Crawl> getCrawls() {
+    try {
+      return crawlDao.queryForAll();
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void saveCrawl(Crawl crawl) {
+    try {
+      crawlDao.createOrUpdate(crawl);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void deleteCrawl(Long crawlId) {
+    try {
+      crawlDao.deleteById(crawlId);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void crawlingStarted(Crawl crawl) {
+    crawl.setStatus(CrawlStatus.CRAWLING);
+    crawl.setProgress(0);
+    saveCrawl(crawl);
+  }
+
+  @Override
+  public void onCrawlError(Crawl crawl, String msg) {
+    crawl.setStatus(CrawlStatus.ERROR);
+    saveCrawl(crawl);
+  }
+
+  @Override
+  public void commandExecuted(Crawl crawl, RemoteCommand command, int progress) {
+    crawl.setProgress(progress);
+    saveCrawl(crawl);
+  }
+
+  @Override
+  public void crawlingFinished(Crawl crawl) {
+    crawl.setStatus(CrawlStatus.FINISHED);
+    saveCrawl(crawl);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
new file mode 100644
index 0000000..e100054
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+@Service
+public class NutchInstanceServiceImpl implements NutchInstanceService {
+
+  @Resource
+  private NutchClientFactory nutchClientFactory;
+
+  @Resource
+  private Dao<NutchInstance, Long> instancesDao;
+
+  @Override
+  public List<NutchInstance> getInstances() {
+    try {
+      return instancesDao.queryForAll();
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+
+  @Override
+  public NutchInstance getInstance(Long id) {
+    try {
+      return instancesDao.queryForId(id);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void saveInstance(NutchInstance instance) {
+    try {
+      instancesDao.createOrUpdate(instance);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void removeInstance(Long id) {
+    try {
+      instancesDao.deleteById(id);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
new file mode 100644
index 0000000..db989cf
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.net.ConnectException;
+import java.util.Collections;
+import java.util.Map;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.NutchStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.apache.nutch.webui.service.NutchService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Service;
+
+import com.sun.jersey.api.client.ClientHandlerException;
+
+@Service
+public class NutchServiceImpl implements NutchService {
+  private static final Logger logger = LoggerFactory
+      .getLogger(NutchServiceImpl.class);
+
+  @Resource
+  private NutchClientFactory nutchClientFactory;
+
+  @Resource
+  private NutchInstanceService instanceService;
+
+  @Override
+  public ConnectionStatus getConnectionStatus(Long instanceId) {
+    NutchInstance instance = instanceService.getInstance(instanceId);
+    try {
+      NutchStatus nutchStatus = nutchClientFactory.getClient(instance)
+          .getNutchStatus();
+      if (nutchStatus.getStartDate() != null) {
+        return ConnectionStatus.CONNECTED;
+      }
+    } catch (Exception e) {
+      if (e.getCause() instanceof ConnectException) {
+        return ConnectionStatus.DISCONNECTED;
+      }
+
+      logger.error("Cannot connect to nutch server!", e);
+    }
+    return null;
+  }
+
+  @Override
+  public Map<String, String> getNutchConfig(Long instanceId) {
+    NutchInstance instance = instanceService.getInstance(instanceId);
+    try {
+      return nutchClientFactory.getClient(instance).getNutchConfig("default");
+    } catch (ClientHandlerException exception) {
+      return Collections.emptyMap();
+    }
+  }
+
+  @Override
+  public NutchStatus getNutchStatus(Long instanceId) {
+    NutchInstance instance = instanceService.getInstance(instanceId);
+    return nutchClientFactory.getClient(instance).getNutchStatus();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
new file mode 100644
index 0000000..fced2d3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.model.SeedUrl;
+import org.apache.nutch.webui.service.SeedListService;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+@Service
+public class SeedListServiceImpl implements SeedListService {
+
+  @Resource
+  private Dao<SeedList, Long> seedListDao;
+
+  @Resource
+  private Dao<SeedUrl, Long> seedUrlDao;
+
+  @Override
+  public void save(SeedList seedList) {
+    try {
+      seedListDao.createOrUpdate(seedList);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void delete(Long seedListId) {
+    try {
+      seedListDao.deleteById(seedListId);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+
+  @Override
+  public List<SeedList> findAll() {
+    try {
+      return seedListDao.queryForAll();
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public SeedList getSeedList(Long seedListId) {
+    try {
+      return seedListDao.queryForId(seedListId);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/overview.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/overview.html b/nutch-core/src/main/java/overview.html
new file mode 100644
index 0000000..1132141
--- /dev/null
+++ b/nutch-core/src/main/java/overview.html
@@ -0,0 +1,9 @@
+<html>
+<head>
+   <title>Apache Nutch</title>
+</head>
+<body>
+<p>Apache Nutch is a highly extensible and scalable open source web crawler software project.</p>
+<p>Nutch is a project of the Apache Software Foundation and is part of the larger Apache community of developers and users.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
new file mode 100644
index 0000000..bb938a6
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
@@ -0,0 +1,270 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.crawl.CrawlDbUpdateUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Emulate a continuous crawl for one URL: repeatedly "fetch" the URL and run
+ * the CrawlDb update, checking the state of the resulting {@link CrawlDatum}
+ * after every round. Subclasses override {@link #fetch(CrawlDatum, long)},
+ * {@link #parse(CrawlDatum)} and {@link #check(CrawlDatum)} to emulate
+ * specific fetch/parse outcomes.
+ */
+public class ContinuousCrawlTestUtil extends TestCase {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ContinuousCrawlTestUtil.class);
+
+  /** the single URL all emulated fetch/update rounds operate on */
+  protected static Text dummyURL = new Text("http://nutch.apache.org/");
+
+  protected static Configuration defaultConfig = CrawlDBTestUtil
+      .createConfiguration();
+
+  /** interval at which the crawler is relaunched (default: every day) */
+  protected long interval = FetchSchedule.SECONDS_PER_DAY * 1000;
+  /** total duration of the emulated continuous crawl (default: two years) */
+  protected long duration = 2 * 365L * FetchSchedule.SECONDS_PER_DAY * 1000L;
+
+  protected Configuration configuration;
+  private FetchSchedule schedule;
+
+  /** status a fetched datum should get */
+  protected byte fetchStatus = CrawlDatum.STATUS_FETCH_SUCCESS;
+  /** expected status of the resulting Db datum */
+  protected byte expectedDbStatus = CrawlDatum.STATUS_DB_FETCHED;
+
+  /** for signature calculation */
+  protected Signature signatureImpl;
+  protected Content content = new Content();
+
+  // instance initializer: seed the dummy document content ("nutch")
+  {
+    byte[] data = { 'n', 'u', 't', 'c', 'h' };
+    content.setContent(data);
+  }
+
+  protected ContinuousCrawlTestUtil(Configuration conf) {
+    configuration = conf;
+    schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf));
+    signatureImpl = SignatureFactory.getSignature(conf);
+  }
+
+  protected ContinuousCrawlTestUtil(Configuration conf, byte fetchStatus,
+      byte expectedDbStatus) {
+    this(conf);
+    this.fetchStatus = fetchStatus;
+    this.expectedDbStatus = expectedDbStatus;
+  }
+
+  protected ContinuousCrawlTestUtil() {
+    this(defaultConfig);
+  }
+
+  protected ContinuousCrawlTestUtil(byte fetchStatus, byte expectedDbStatus) {
+    this(defaultConfig, fetchStatus, expectedDbStatus);
+  }
+
+  /** set the interval the crawl is relaunched (default: every day) */
+  protected void setInterval(int seconds) {
+    interval = seconds * 1000L;
+  }
+
+  /**
+   * set the duration of the continuous crawl (default = 2 years)
+   * <p>
+   * NOTE(review): method name contains a typo ("Duraction"); kept as-is
+   * because subclasses and javadoc links reference it.
+   * </p>
+   */
+  protected void setDuraction(int seconds) {
+    duration = seconds * 1000L;
+  }
+
+  /**
+   * default fetch action: set status and time
+   * 
+   * @param datum
+   *          CrawlDatum to fetch
+   * @param currentTime
+   *          current time used to set the fetch time via
+   *          {@link CrawlDatum#setFetchTime(long)}
+   * @return the modified CrawlDatum
+   */
+  protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+    datum.setStatus(fetchStatus);
+    datum.setFetchTime(currentTime);
+    return datum;
+  }
+
+  /**
+   * get signature for content and configured signature implementation
+   */
+  protected byte[] getSignature() {
+    return signatureImpl.calculate(content, null);
+  }
+
+  /**
+   * change content to force a changed signature
+   */
+  protected void changeContent() {
+    byte[] data = Arrays.copyOf(content.getContent(),
+        content.getContent().length + 1);
+    data[content.getContent().length] = '2'; // append one byte
+    content.setContent(data);
+    LOG.info("document content changed");
+  }
+
+  /**
+   * default parse action: add signature if successfully fetched
+   * 
+   * @param fetchDatum
+   *          fetch datum
+   * @return list of all datums resulting from parse (status: signature, linked,
+   *         parse_metadata)
+   */
+  protected List<CrawlDatum> parse(CrawlDatum fetchDatum) {
+    List<CrawlDatum> parseDatums = new ArrayList<CrawlDatum>(0);
+    if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
+      CrawlDatum signatureDatum = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
+      signatureDatum.setSignature(getSignature());
+      parseDatums.add(signatureDatum);
+    }
+    return parseDatums;
+  }
+
+  /**
+   * default implementation to check the result state
+   * 
+   * @param datum
+   *          the CrawlDatum to be checked
+   * @return true if the check succeeds
+   */
+  protected boolean check(CrawlDatum datum) {
+    if (datum.getStatus() != expectedDbStatus)
+      return false;
+    return true;
+  }
+
+  /**
+   * Run the continuous crawl.
+   * <p>
+   * A loop emulates a continuous crawl launched in regular intervals (see
+   * {@link #setInterval(int)} over a longer period ({@link #setDuraction(int)}.
+   * 
+   * <ul>
+   * <li>every "round" emulates
+   * <ul>
+   * <li>a fetch (see {@link #fetch(CrawlDatum, long)})</li>
+   * <li>{@literal updatedb} which returns a {@link CrawlDatum}</li>
+   * </ul>
+   * <li>the returned CrawlDatum is used as input for the next round</li>
+   * <li>and is checked whether it is correct (see {@link #check(CrawlDatum)})
+   * </ul>
+   * </p>
+   * 
+   * @param maxErrors
+   *          (if > 0) continue crawl even if the checked CrawlDatum is not
+   *          correct, but stop after max. number of errors; if <= 0 the run
+   *          stops (returns false) at the first failed check
+   * 
+   * @return false if a check of CrawlDatum failed, true otherwise
+   */
+  protected boolean run(int maxErrors) {
+
+    long now = System.currentTimeMillis();
+
+    CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>(
+        new CrawlDbReducer(), configuration);
+
+    /* start with a db_unfetched */
+    CrawlDatum dbDatum = new CrawlDatum();
+    dbDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    schedule.initializeSchedule(dummyURL, dbDatum); // initialize fetchInterval
+    dbDatum.setFetchTime(now);
+
+    LOG.info("Emulate a continuous crawl, launched every "
+        + (interval / (FetchSchedule.SECONDS_PER_DAY * 1000)) + " day ("
+        + (interval / 1000) + " seconds)");
+    long maxTime = (now + duration);
+    long nextTime = now;
+    long lastFetchTime = -1;
+    boolean ok = true; // overall result: set to false on a failed check
+    CrawlDatum fetchDatum = new CrawlDatum();
+    /*
+     * Keep copies because CrawlDbReducer.reduce() and
+     * FetchSchedule.shouldFetch() may alter the references. Copies are used for
+     * verbose logging in case of an error.
+     */
+    CrawlDatum copyDbDatum = new CrawlDatum();
+    CrawlDatum copyFetchDatum = new CrawlDatum();
+    CrawlDatum afterShouldFetch = new CrawlDatum();
+    int errorCount = 0;
+    while (nextTime < maxTime) {
+      LOG.info("check: " + new Date(nextTime));
+      fetchDatum.set(dbDatum);
+      copyDbDatum.set(dbDatum);
+      if (schedule.shouldFetch(dummyURL, fetchDatum, nextTime)) {
+        LOG.info("... fetching now (" + new Date(nextTime) + ")");
+        if (lastFetchTime > -1) {
+          LOG.info("(last fetch: " + new Date(lastFetchTime) + " = "
+              + TimingUtil.elapsedTime(lastFetchTime, nextTime) + " ago)");
+        }
+        lastFetchTime = nextTime;
+        afterShouldFetch.set(fetchDatum);
+        fetchDatum = fetch(fetchDatum, nextTime);
+        copyFetchDatum.set(fetchDatum);
+        // emulate updatedb: old db datum + fetch datum + parse output
+        List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+        values.add(dbDatum);
+        values.add(fetchDatum);
+        values.addAll(parse(fetchDatum));
+        List<CrawlDatum> res = updateDb.update(values);
+        assertNotNull("null returned", res);
+        assertFalse("no CrawlDatum", 0 == res.size());
+        assertEquals("more than one CrawlDatum", 1, res.size());
+        if (!check(res.get(0))) {
+          LOG.info("previously in CrawlDb: " + copyDbDatum);
+          LOG.info("after shouldFetch(): " + afterShouldFetch);
+          LOG.info("fetch: " + fetchDatum);
+          LOG.warn("wrong result in CrawlDb: " + res.get(0));
+          if (++errorCount >= maxErrors) {
+            if (maxErrors > 0) {
+              LOG.error("Max. number of errors " + maxErrors
+                  + " reached. Stopping.");
+            }
+            return false;
+          } else {
+            ok = false; // record failure but keep going
+          }
+        }
+        /* use the returned CrawlDatum for the next fetch */
+        dbDatum = res.get(0);
+      }
+      nextTime += interval;
+    }
+    return ok;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
new file mode 100644
index 0000000..56905e4
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.Text;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.bio.SocketConnector;
+import org.mortbay.jetty.handler.ContextHandler;
+import org.mortbay.jetty.handler.ResourceHandler;
+
+/**
+ * Test helpers for building a synthetic CrawlDb, writing seed lists, and
+ * starting a local Jetty server serving static content.
+ */
+public class CrawlDBTestUtil {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDBTestUtil.class);
+
+  /**
+   * Creates synthetic crawldb
+   * 
+   * @param conf
+   *          configuration used to create the MapFile writer
+   * @param fs
+   *          filesystem where db will be created (NOTE(review): parameter is
+   *          currently unused by the implementation — confirm whether callers
+   *          rely on the signature or it can be dropped)
+   * @param crawldb
+   *          path were db will be created
+   * @param init
+   *          urls to be inserted, objects are of type URLCrawlDatum
+   * @throws Exception
+   */
+  public static void createCrawlDb(Configuration conf, FileSystem fs,
+      Path crawldb, List<URLCrawlDatum> init) throws Exception {
+    LOG.trace("* creating crawldb: " + crawldb);
+    Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir,
+        "part-r-00000"), wKeyOpt, wValueOpt);
+    Iterator<URLCrawlDatum> it = init.iterator();
+    while (it.hasNext()) {
+      URLCrawlDatum row = it.next();
+      LOG.info("adding:" + row.url.toString());
+      writer.append(new Text(row.url), row.datum);
+    }
+    writer.close();
+  }
+
+  /**
+   * For now we need to manually construct our Configuration, because we need to
+   * override the default one and it is currently not possible to use
+   * dynamically set values.
+   * 
+   * @return a test Configuration (see {@link #createConfiguration()})
+   * @deprecated Use {@link #createConfiguration()} instead
+   */
+  @Deprecated
+  public static Configuration create() {
+    return createConfiguration();
+  }
+
+  /**
+   * For now we need to manually construct our Configuration, because we need to
+   * override the default one and it is currently not possible to use
+   * dynamically set values.
+   * 
+   * @return a Configuration with nutch-default.xml plus crawl-tests.xml
+   *         overrides applied
+   */
+  public static Configuration createConfiguration() {
+    Configuration conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("crawl-tests.xml");
+    return conf;
+  }
+
+  /** simple pair of a URL and its associated CrawlDatum */
+  public static class URLCrawlDatum {
+
+    public Text url;
+
+    public CrawlDatum datum;
+
+    public URLCrawlDatum(Text url, CrawlDatum datum) {
+      this.url = url;
+      this.datum = datum;
+    }
+  }
+
+  /**
+   * Generate seedlist (without per-URL metadata)
+   * 
+   * @throws IOException
+   */
+  public static void generateSeedList(FileSystem fs, Path urlPath,
+      List<String> urls) throws IOException {
+    generateSeedList(fs, urlPath, urls, new ArrayList<String>());
+  }
+
+  /**
+   * Generate seedlist: writes one URL per line to {@code urls.txt} under
+   * urlPath, appending the corresponding metadata entry (if any) to the same
+   * line. Metadata entries beyond the number of URLs are ignored.
+   * 
+   * @throws IOException
+   */
+  public static void generateSeedList(FileSystem fs, Path urlPath,
+      List<String> urls, List<String> metadata) throws IOException {
+    FSDataOutputStream out;
+    Path file = new Path(urlPath, "urls.txt");
+    fs.mkdirs(urlPath);
+    out = fs.create(file);
+
+    Iterator<String> urls_i = urls.iterator();
+    Iterator<String> metadata_i = metadata.iterator();
+
+    String url;
+    String md;
+    while (urls_i.hasNext()) {
+      url = urls_i.next();
+
+      out.writeBytes(url);
+
+      if (metadata_i.hasNext()) {
+        md = metadata_i.next();
+        out.writeBytes(md);
+      }
+
+      out.writeBytes("\n");
+    }
+
+    out.flush();
+    out.close();
+  }
+
+  /**
+   * Creates a new JettyServer with one static root context
+   * 
+   * @param port
+   *          port to listen to
+   * @param staticContent
+   *          folder where static content lives
+   * @return the configured (not yet started) server bound to 127.0.0.1
+   * @throws UnknownHostException
+   */
+  public static Server getServer(int port, String staticContent)
+      throws UnknownHostException {
+    Server webServer = new org.mortbay.jetty.Server();
+    SocketConnector listener = new SocketConnector();
+    listener.setPort(port);
+    listener.setHost("127.0.0.1");
+    webServer.addConnector(listener);
+    ContextHandler staticContext = new ContextHandler();
+    staticContext.setContextPath("/");
+    staticContext.setResourceBase(staticContent);
+    staticContext.addHandler(new ResourceHandler());
+    webServer.addHandler(staticContext);
+    return webServer;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
new file mode 100644
index 0000000..7238f88
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configuration.IntegerRanges;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.Counters;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Partitioner;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.Reducer.Context;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.apache.hadoop.security.Credentials;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+
+/**
+ * Utility to test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}): call
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+ * (using MRUnit) with the old CrawlDatum (db status) and the new one (fetch
+ * status). This is the new-MapReduce-API counterpart of CrawlDbUpdateUtil.
+ */
+public class CrawlDbUpdateTestDriver<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDbUpdateTestDriver.class);
+
+  private ReduceDriver<Text, CrawlDatum, Text, CrawlDatum> reduceDriver;
+  private T reducer;
+  private Configuration configuration;
+
+  /** URL used as the reduce key for every test run */
+  public static Text dummyURL = new Text("http://nutch.apache.org/");
+
+//  protected CrawlDbUpdateUtilNewAPI(T red, T.Context con) {
+  protected CrawlDbUpdateTestDriver(T updateReducer, Configuration conf) {
+    reducer = updateReducer;
+    configuration = conf;
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   *
+   * @param values
+   *          list of input CrawlDatums; NOTE(review): the list is shuffled in
+   *          place, so callers must not rely on its order afterwards
+   * @return list of resulting CrawlDatum(s) in CrawlDb; empty on null/empty
+   *         input or if the reduce throws an IOException
+   */
+  public List<CrawlDatum> update(List<CrawlDatum> values) {
+    List<CrawlDatum> result = new ArrayList<CrawlDatum>(0);
+    if (values == null || values.size() == 0) {
+      return result;
+    }
+    Collections.shuffle(values); // sorting of values should have no influence
+    // a fresh driver is created for every run — presumably because MRUnit
+    // drivers are not reusable across runs (TODO confirm)
+    reduceDriver = ReduceDriver.newReduceDriver(reducer);
+    reduceDriver.setConfiguration(configuration);
+    reduceDriver.withInput(dummyURL, values);
+    List<Pair<Text,CrawlDatum>> reduceResult;
+    try {
+      reduceResult = reduceDriver.run();
+      for (Pair<Text,CrawlDatum> p : reduceResult) {
+        // keep only output keyed by our dummy URL
+        if (p.getFirst().equals(dummyURL)) {
+          result.add(p.getSecond());
+        }
+      }
+    } catch (IOException e) {
+      // log and return whatever was collected so far (possibly empty)
+      LOG.error(StringUtils.stringifyException(e));
+      return result;
+    }
+    return result;
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   *
+   * @param dbDatum
+   *          previous CrawlDatum in CrawlDb (may be null)
+   * @param fetchDatum
+   *          CrawlDatum resulting from fetching (may be null)
+   * @return list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
+    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+    if (dbDatum != null)
+      values.add(dbDatum);
+    if (fetchDatum != null)
+      values.add(fetchDatum);
+    return update(values);
+  }
+
+  /**
+   * see {@link #update(List)}
+   */
+  public List<CrawlDatum> update(CrawlDatum... values) {
+    return update(Arrays.asList(values));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
new file mode 100644
index 0000000..bfb716d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.Counters;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility to test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}): call
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} with
+ * the old CrawlDatum (db status) and the new one (fetch status). Uses the old
+ * (mapred) MapReduce API.
+ */
+public class CrawlDbUpdateUtil<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDbUpdateUtil.class);
+
+  private T reducer;
+
+  /** URL used as the reduce key for every test run */
+  public static Text dummyURL = new Text("http://nutch.apache.org/");
+
+  protected CrawlDbUpdateUtil(T red, Configuration conf) {
+    reducer = red;
+    // configure the reducer up front so update() can call reduce() directly
+    reducer.configure(new JobConf(conf));
+  }
+
+  /** {@link OutputCollector} to collect all values in a {@link List} */
+  private class ListOutputCollector implements
+      OutputCollector<Text, CrawlDatum> {
+
+    private List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+
+    public void collect(Text key, CrawlDatum value) throws IOException {
+      values.add(value);
+    }
+
+    /** collected values as list */
+    public List<CrawlDatum> getValues() {
+      return values;
+    }
+
+  }
+
+  /**
+   * Dummy reporter which does nothing and does not return null for getCounter()
+   * (unlike {@link Reporter#NULL}, whose getCounter() may return null).
+   * 
+   * @see Reporter#NULL
+   */
+  private class DummyReporter implements Reporter {
+
+    private Counters dummyCounters = new Counters();
+
+    public void progress() {
+    }
+
+    public Counter getCounter(Enum<?> arg0) {
+      return dummyCounters.getGroup("dummy").getCounterForName("dummy");
+    }
+
+    public Counter getCounter(String arg0, String arg1) {
+      return dummyCounters.getGroup("dummy").getCounterForName("dummy");
+    }
+
+    public InputSplit getInputSplit() throws UnsupportedOperationException {
+      throw new UnsupportedOperationException("Dummy reporter without input");
+    }
+
+    public void incrCounter(Enum<?> arg0, long arg1) {
+    }
+
+    public void incrCounter(String arg0, String arg1, long arg2) {
+    }
+
+    public void setStatus(String arg0) {
+    }
+
+    public float getProgress() {
+      return 1f;
+    }
+
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   * 
+   * @param values
+   *          list of input CrawlDatums; NOTE(review): the list is shuffled in
+   *          place, so callers must not rely on its order afterwards
+   * @return list of resulting CrawlDatum(s) in CrawlDb; empty on null/empty
+   *         input
+   */
+  public List<CrawlDatum> update(List<CrawlDatum> values) {
+    if (values == null || values.size() == 0) {
+      return new ArrayList<CrawlDatum>(0);
+    }
+    Collections.shuffle(values); // sorting of values should have no influence
+    ListOutputCollector output = new ListOutputCollector();
+    try {
+      reducer.reduce(dummyURL, values.iterator(), output, new DummyReporter());
+    } catch (IOException e) {
+      // log and fall through: whatever was collected before the error is
+      // still returned
+      LOG.error(StringUtils.stringifyException(e));
+    }
+    return output.getValues();
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   * 
+   * @param dbDatum
+   *          previous CrawlDatum in CrawlDb (may be null)
+   * @param fetchDatum
+   *          CrawlDatum resulting from fetching (may be null)
+   * @return list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
+    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+    if (dbDatum != null)
+      values.add(dbDatum);
+    if (fetchDatum != null)
+      values.add(fetchDatum);
+    return update(values);
+  }
+
+  /**
+   * see {@link #update(List)}
+   */
+  public List<CrawlDatum> update(CrawlDatum... values) {
+    return update(Arrays.asList(values));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java b/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
new file mode 100644
index 0000000..94c27b5
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.IntWritable;
+
+/**
+ * Trivial {@link IntWritable} subclass used by tests as a distinct placeholder
+ * Writable type.
+ */
+public class DummyWritable extends IntWritable {
+
+  /** no-arg constructor required for Writable deserialization */
+  public DummyWritable() {
+
+  }
+
+  public DummyWritable(int i) {
+    super(i);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
new file mode 100644
index 0000000..fd88c7d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
@@ -0,0 +1,171 @@
+package org.apache.nutch.crawl;
+
+import static org.apache.nutch.crawl.CrawlDatum.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.TimingUtil;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ IntegrationTest.class})
+public class TODOTestCrawlDbStates extends TestCrawlDbStates {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TODOTestCrawlDbStates.class);
+
+  /**
+   * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max
+   * is reached. Retry counter has to be reset appropriately.
+   */
+  @Test
+  public void testCrawlDbReducerPageRetrySchedule() {
+    LOG.info("NUTCH-578: test long running continuous crawl with fetch_retry");
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestFetchRetry();
+    // keep going for long, to "provoke" a retry counter overflow
+    if (!crawlUtil.run(150)) {
+      fail("fetch_retry did not result in a db_gone if retry counter > maxRetries (NUTCH-578)");
+    }
+  }
+
+  private class ContinuousCrawlTestFetchRetry extends ContinuousCrawlTestUtil {
+
+    private int retryMax = 3;
+    private int totalRetries = 0;
+
+    ContinuousCrawlTestFetchRetry() {
+      super();
+      fetchStatus = STATUS_FETCH_RETRY;
+      retryMax = configuration.getInt("db.fetch.retry.max", retryMax);
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      datum.setStatus(fetchStatus);
+      datum.setFetchTime(currentTime);
+      totalRetries++;
+      return datum;
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (result.getRetriesSinceFetch() > retryMax) {
+        LOG.warn("Retry counter > db.fetch.retry.max: " + result);
+      } else if (result.getRetriesSinceFetch() == Byte.MAX_VALUE) {
+        LOG.warn("Retry counter max. value reached (overflow imminent): "
+            + result);
+      } else if (result.getRetriesSinceFetch() < 0) {
+        LOG.error("Retry counter overflow: " + result);
+        return false;
+      }
+      // use retry counter bound to this class (totalRetries)
+      // instead of result.getRetriesSinceFetch() because the retry counter
+      // in CrawlDatum could be reset (eg. NUTCH-578_v5.patch)
+      if (totalRetries < retryMax) {
+        if (result.getStatus() == STATUS_DB_UNFETCHED) {
+          LOG.info("ok: " + result);
+          result.getRetriesSinceFetch();
+          return true;
+        }
+      } else {
+        if (result.getStatus() == STATUS_DB_GONE) {
+          LOG.info("ok: " + result);
+          return true;
+        }
+      }
+      LOG.warn("wrong: " + result);
+      return false;
+    }
+
+  }
+
+  /**
+   * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for
+   * documents not modified
+   * <p>
+   * Problem: documents not modified for a longer time are fetched in every
+   * cycle because of an error in the SYNC_DELTA calculation of
+   * {@link AdaptiveFetchSchedule}. <br>
+   * The next fetch time should always be in the future, never in the past.
+   * </p>
+   */
+  @Test
+  public void testAdaptiveFetchScheduleSyncDelta() {
+    LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    conf.setLong("db.fetch.interval.default", 172800); // 2 days
+    conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
+    conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
+    conf.setLong("db.fetch.interval.max", 604800); // 7 days
+    conf.set("db.fetch.schedule.class",
+        "org.apache.nutch.crawl.AdaptiveFetchSchedule");
+    ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime(
+        conf);
+    crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
+    if (!crawlUtil.run(100)) {
+      fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
+    }
+  }
+
+  private class CrawlTestFetchScheduleNotModifiedFetchTime extends
+      CrawlTestFetchNotModified {
+
+    // time of current fetch
+    private long fetchTime;
+
+    private long minInterval;
+    private long maxInterval;
+
+    CrawlTestFetchScheduleNotModifiedFetchTime(Configuration conf) {
+      super(conf);
+      minInterval = conf.getLong("db.fetch.schedule.adaptive.min_interval",
+          86400); // 1 day
+      maxInterval = conf.getLong("db.fetch.schedule.adaptive.max_interval",
+          604800); // 7 days
+      if (conf.getLong("db.fetch.interval.max", 604800) < maxInterval) {
+        maxInterval = conf.getLong("db.fetch.interval.max", 604800);
+      }
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      // remember time of fetching
+      fetchTime = currentTime;
+      return super.fetch(datum, currentTime);
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (result.getStatus() == STATUS_DB_NOTMODIFIED) {
+        // check only status notmodified here
+        long secondsUntilNextFetch = (result.getFetchTime() - fetchTime) / 1000L;
+        if (secondsUntilNextFetch < -1) {
+          // next fetch time is in the past (more than one second)
+          LOG.error("Next fetch time is in the past: " + result);
+          return false;
+        }
+        if (secondsUntilNextFetch < 60) {
+          // next fetch time is in less than one minute
+          // (critical: Nutch can hardly be so fast)
+          LOG.error("Less then one minute until next fetch: " + result);
+        }
+        // Next fetch time should be within min. and max. (tolerance: 60 sec.)
+        if (secondsUntilNextFetch + 60 < minInterval
+            || secondsUntilNextFetch - 60 > maxInterval) {
+          LOG.error("Interval until next fetch time ("
+              + TimingUtil.elapsedTime(fetchTime, result.getFetchTime())
+              + ") is not within min. and max. interval: " + result);
+          // TODO: is this a failure?
+        }
+      }
+      return true;
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
new file mode 100644
index 0000000..3fa798d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for {@link AdaptiveFetchSchedule}: verifies that the fetch
+ * interval stays unchanged, is decreased, or is increased according to the
+ * change status passed to {@code setFetchSchedule}.
+ *
+ * NOTE(review): this class mixes JUnit 3 (extends TestCase) with JUnit 4
+ * annotations; it runs because the test method name starts with "test",
+ * but a full migration to JUnit 4 would be cleaner.
+ */
+public class TestAdaptiveFetchSchedule extends TestCase {
+
+  private float inc_rate; // interval increase rate for unmodified pages
+  private float dec_rate; // interval decrease rate for modified pages
+  private Configuration conf;
+  private long curTime, lastModified; // both 0 (epoch), sufficient here
+  private int changed, interval, calculateInterval;
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    conf = NutchConfiguration.create();
+    inc_rate = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+    dec_rate = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+    interval = 100;
+    lastModified = 0;
+  }
+
+  /**
+   * Test the core functionality of AdaptiveFetchSchedule: run one
+   * setFetchSchedule() round per change status and validate the resulting
+   * fetch interval.
+   */
+  @Test
+  public void testAdaptiveFetchSchedule() {
+
+    FetchSchedule fs = new AdaptiveFetchSchedule();
+    fs.setConf(conf);
+
+    CrawlDatum p = prepareCrawlDatum();
+    Text url = new Text("http://www.example.com");
+
+    changed = FetchSchedule.STATUS_UNKNOWN;
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
+    validateFetchInterval(changed, p.getFetchInterval());
+
+    changed = FetchSchedule.STATUS_MODIFIED;
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
+    validateFetchInterval(changed, p.getFetchInterval());
+    // reset to the baseline interval before checking the NOTMODIFIED case
+    p.setFetchInterval(interval);
+
+    changed = FetchSchedule.STATUS_NOTMODIFIED;
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
+    validateFetchInterval(changed, p.getFetchInterval());
+
+  }
+
+  /**
+   * Prepare a CrawlDatum (STATUS_DB_UNFETCHED) to Test AdaptiveFetchSchedule.
+   * 
+   * @return properly initialized CrawlDatum
+   */
+  public CrawlDatum prepareCrawlDatum() {
+    CrawlDatum p = new CrawlDatum();
+    p.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    p.setFetchInterval(interval);
+    p.setScore(1.0f);
+    p.setFetchTime(0);
+    return p;
+  }
+
+  /**
+   * Validates the interval value calculated for the given change status.
+   * Note: assertEquals takes the expected value first; the original had the
+   * arguments swapped, which inverted failure messages.
+   * 
+   * @param changed
+   *          status value to check calculated interval value.
+   * @param getInterval
+   *          actual interval from the CrawlDatum, calculated via the
+   *          AdaptiveFetchSchedule algorithm.
+   */
+  private void validateFetchInterval(int changed, int getInterval) {
+
+    if (changed == FetchSchedule.STATUS_UNKNOWN) {
+      // unknown change status: interval must stay untouched
+      assertEquals(interval, getInterval);
+
+    } else if (changed == FetchSchedule.STATUS_MODIFIED) {
+      // page changed: interval is decreased by dec_rate
+      calculateInterval = (int) (interval - (interval * dec_rate));
+      assertEquals(calculateInterval, getInterval);
+
+    } else if (changed == FetchSchedule.STATUS_NOTMODIFIED) {
+      // page unchanged: interval is increased by inc_rate
+      calculateInterval = (int) (interval + (interval * inc_rate));
+      assertEquals(calculateInterval, getInterval);
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
new file mode 100644
index 0000000..773dd29
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.*;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchJob;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * CrawlDbFiltering test which tests for correct, error free url normalization
+ * when the CrawlDB includes urls with <code>DB GONE</code> status and
+ * <code>CRAWLDB_PURGE_404</code> is set to true.
+ * 
+ * @author lufeng
+ */
+public class TestCrawlDbFilter {
+  Configuration conf;
+  Path dbDir;
+  Path newCrawlDb;
+  final static Path testdir = new Path("build/test/crawldbfilter-test");
+  FileSystem fs;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = CrawlDBTestUtil.createConfiguration();
+    fs = FileSystem.get(conf);
+    fs.delete(testdir, true);
+  }
+
+  @After
+  public void tearDown() {
+    delete(testdir);
+  }
+
+  /** Best-effort removal of {@code p}; cleanup failures must not fail tests. */
+  private void delete(Path p) {
+    try {
+      fs.delete(p, true);
+    } catch (IOException ignored) {
+      // deliberately swallowed: cleanup only
+    }
+  }
+
+  /**
+   * Test url404Purging: with CRAWLDB_PURGE_404 enabled, the DB_GONE entry
+   * must be dropped, leaving two of the three seeded records.
+   * 
+   * @throws Exception
+   */
+  @Test
+  @Category({IntegrationTest.class})
+  public void testUrl404Purging() throws Exception {
+    // create a CrawlDatum with DB GONE status
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+    list.add(new URLCrawlDatum(new Text("http://www.example.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
+    list.add(new URLCrawlDatum(new Text("http://www.example1.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
+    list.add(new URLCrawlDatum(new Text("http://www.example2.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
+    dbDir = new Path(testdir, "crawldb");
+    newCrawlDb = new Path(testdir, "newcrawldb");
+    // create crawldb
+    CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
+    // set CRAWLDB_PURGE_404 to true
+    conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404, true);
+    conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
+    conf.setBoolean(CrawlDbFilter.URL_FILTERING, false);
+    conf.setInt("urlnormalizer.loop.count", 2);
+    JobConf job = new NutchJob(conf);
+    job.setJobName("Test CrawlDbFilter");
+    Path current = new Path(dbDir, "current");
+    if (FileSystem.get(job).exists(current)) {
+      FileInputFormat.addInputPath(job, current);
+    }
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setMapperClass(CrawlDbFilter.class);
+    job.setReducerClass(CrawlDbReducer.class);
+    FileOutputFormat.setOutputPath(job, newCrawlDb);
+    job.setOutputFormat(MapFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+    JobClient.runJob(job);
+
+    Path fetchlist = new Path(new Path(newCrawlDb, "part-00000"), "data");
+
+    ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
+    // verify we got right amount of records
+    Assert.assertEquals(2, l.size());
+  }
+
+  /**
+   * Read contents of fetchlist.
+   * 
+   * @param fetchlist
+   *          path to Generated fetchlist
+   * @return Generated {@link URLCrawlDatum} objects
+   * @throws IOException
+   */
+  private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
+      throws IOException {
+    Option fFile = SequenceFile.Reader.file(fetchlist);
+    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+    // try-with-resources: the reader is closed even if next() throws
+    // (the original leaked it on exception and used a needless READ label)
+    try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile)) {
+      while (true) {
+        // fresh key/value per record: URLCrawlDatum keeps the references
+        Text key = new Text();
+        CrawlDatum value = new CrawlDatum();
+        if (!reader.next(key, value)) {
+          break;
+        }
+        l.add(new URLCrawlDatum(key, value));
+      }
+    }
+    return l;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
new file mode 100644
index 0000000..599c353
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.TreeSet;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Test for {@link CrawlDbMerger}: merges two CrawlDbs that share one key and
+ * verifies the merged entries (newer datum wins, metadata is combined).
+ */
+public class TestCrawlDbMerger {
+  // NOTE(review): logger now named after this test class; the original used
+  // CrawlDbMerger.class, a copy-paste mistake that mislabeled log records.
+  private static final Logger LOG = Logger.getLogger(TestCrawlDbMerger.class
+      .getName());
+
+  String url10 = "http://example.com/";
+  String url11 = "http://example.com/foo";
+  String url20 = "http://example.com/"; // same key as url10: merge conflict
+  String url21 = "http://example.com/bar";
+  String[] urls_expected = new String[] { url10, url11, url21 };
+
+  TreeSet<String> init1 = new TreeSet<String>();
+  TreeSet<String> init2 = new TreeSet<String>();
+  HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>();
+  CrawlDatum cd1, cd2, cd3;
+  Configuration conf;
+  FileSystem fs;
+  Path testDir;
+  CrawlDbReader reader;
+
+  @Before
+  public void setUp() throws Exception {
+    init1.add(url10);
+    init1.add(url11);
+    init2.add(url20);
+    init2.add(url21);
+    long time = System.currentTimeMillis();
+    cd1 = new CrawlDatum();
+    cd1.setFetchInterval(1.0f);
+    cd1.setFetchTime(time);
+    cd1.getMetaData().put(new Text("name"), new Text("cd1"));
+    cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));
+    cd2 = new CrawlDatum();
+    cd2.setFetchInterval(1.0f);
+    // cd2 is newer than cd1, so it should win for the shared key url10/url20
+    cd2.setFetchTime(time + 10000);
+    cd2.getMetaData().put(new Text("name"), new Text("cd2"));
+    // cd3 models the expected merge result: cd2's values, union of metadata
+    cd3 = new CrawlDatum();
+    cd3.setFetchInterval(1.0f);
+    cd3.setFetchTime(time + 10000);
+    cd3.getMetaData().putAll(cd1.getMetaData());
+    cd3.getMetaData().putAll(cd2.getMetaData());
+    expected.put(url10, cd3);
+    expected.put(url11, cd1);
+    expected.put(url21, cd2);
+    conf = NutchConfiguration.create();
+    fs = FileSystem.get(conf);
+    testDir = new Path("test-crawldb-" + new java.util.Random().nextInt());
+    fs.mkdirs(testDir);
+  }
+
+  @After
+  public void tearDown() {
+    // best-effort cleanup: failures here must not mask the test result
+    try {
+      if (fs.exists(testDir))
+        fs.delete(testDir, true);
+    } catch (Exception ignored) {
+    }
+    try {
+      reader.close();
+    } catch (Exception ignored) {
+    }
+  }
+
+  /**
+   * Test creates two sample {@link org.apache.nutch.crawl.CrawlDb}'s
+   * populating entries for keys as {@link org.apache.hadoop.io.Text} e.g. URLs 
+   * and values as {@link org.apache.nutch.crawl.CrawlDatum} e.g. record data. 
+   * It then simulates a merge process for the two CrawlDb's via the {@link org.apache.nutch.crawl.CrawlDbMerger}
+   * tool. The merged CrawlDb is then written to an arbitrary output location and the results
+   * read using the {@link org.apache.nutch.crawl.CrawlDbReader} tool. 
+   * Test assertions include comparing expected CrawlDb key, value (URL, CrawlDatum) values
+   * with actual results based on the merge process. 
+   * @throws Exception
+   */
+  @Test
+  @Category({IntegrationTest.class})
+  public void testMerge() throws Exception {
+    Path crawldb1 = new Path(testDir, "crawldb1");
+    Path crawldb2 = new Path(testDir, "crawldb2");
+    Path output = new Path(testDir, "output");
+    createCrawlDb(conf, fs, crawldb1, init1, cd1);
+    createCrawlDb(conf, fs, crawldb2, init2, cd2);
+    CrawlDbMerger merger = new CrawlDbMerger(conf);
+    LOG.fine("* merging crawldbs to " + output);
+    merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false);
+    LOG.fine("* reading crawldb: " + output);
+    reader = new CrawlDbReader();
+    String crawlDb = output.toString();
+    Iterator<String> it = expected.keySet().iterator();
+    while (it.hasNext()) {
+      String url = it.next();
+      LOG.fine("url=" + url);
+      CrawlDatum cd = expected.get(url);
+      CrawlDatum res = reader.get(crawlDb, url, new JobConf(conf));
+      LOG.fine(" -> " + res);
+      // may not be null
+      Assert.assertNotNull(res);
+      // assertEquals reports both values on mismatch (was assertTrue/equals)
+      Assert.assertEquals(cd, res);
+    }
+    reader.close();
+    fs.delete(testDir, true);
+  }
+
+  /**
+   * Writes a MapFile-based CrawlDb under {@code crawldb} containing
+   * {@code cd} as the value for every key in {@code init}.
+   */
+  private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb,
+      TreeSet<String> init, CrawlDatum cd) throws Exception {
+    LOG.fine("* creating crawldb: " + crawldb);
+    Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
+
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+
+    MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+        "part-r-00000"), wKeyOpt, wValueOpt);
+    // close in finally so a failed append does not leak the writer
+    try {
+      Iterator<String> it = init.iterator();
+      while (it.hasNext()) {
+        String key = it.next();
+        writer.append(new Text(key), cd);
+      }
+    } finally {
+      writer.close();
+    }
+  }
+}


[19/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/en.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/en.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/en.test
new file mode 100644
index 0000000..e4465e5
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/en.test
@@ -0,0 +1,105 @@
+Resumption of the session
+I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
+Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful. You have requested a debate on this subject in the course of the next few days, during this part-session. In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union. Please rise, then, for this minute' s silence.
+(The House rose and observed a minute' s silence)
+
+Madam President, on a point of order. You will be aware from the press and television that there have been a number of bomb explosions and killings in Sri Lanka. One of the people assassinated very recently in Sri Lanka was Mr Kumar Ponnambalam, who had visited the European Parliament just a few months ago. Would it be appropriate for you, Madam President, to write a letter to the Sri Lankan President expressing Parliament's regret at his and the other violent deaths in Sri Lanka and urging her to do everything she possibly can to seek a peaceful reconciliation to a very difficult situation?
+
+Yes, Mr Evans, I feel an initiative of the type you have just suggested would be entirely appropriate. If the House agrees, I shall do as Mr Evans has suggested.
+
+Madam President, on a point of order. I would like your advice about Rule 143 concerning inadmissibility. My question relates to something that will come up on Thursday and which I will then raise again.
+The Cunha report on multiannual guidance programmes comes before Parliament on Thursday and contains a proposal in paragraph 6 that a form of quota penalties should be introduced for countries which fail to meet their fleet reduction targets annually. It says that this should be done despite the principle of relative stability. I believe that the principle of relative stability is a fundamental legal principle of the common fisheries policy and a proposal to subvert it would be legally inadmissible. I want to know whether one can raise an objection of that kind to what is merely a report, not a legislative proposal, and whether that is something I can competently do on Thursday.
+
+That is precisely the time when you may, if you wish, raise this question, i.e. on Thursday prior to the start of the presentation of the report.
+
+Madam President, coinciding with this year' s first part-session of the European Parliament, a date has been set, unfortunately for next Thursday, in Texas in America, for the execution of a young 34 year-old man who has been sentenced to death. We shall call him Mr Hicks.
+At the request of a French Member, Mr Zimeray, a petition has already been presented, which many people signed, including myself. However, I would ask you, in accordance with the line which is now constantly followed by the European Parliament and by the whole of the European Community, to make representations, using the weight of your prestigious office and the institution you represent, to the President and to the Governor of Texas, Mr Bush, who has the power to order a stay of execution and to reprieve the condemned person.
+This is all in accordance with the principles that we have always upheld.
+
+Thank you, Mr Segni, I shall do so gladly. Indeed, it is quite in keeping with the positions this House has always adopted.
+
+Madam President, I should like to draw your attention to a case in which this Parliament has consistently shown an interest. It is the case of Alexander Nikitin. All of us here are pleased that the courts have acquitted him and made it clear that in Russia, too, access to environmental information is a constitutional right. Now, however, he is to go before the courts once more because the public prosecutor is appealing. We know, and we have stated as much in very many resolutions indeed, including specifically during the last plenary part-session of last year, that this is not solely a legal case and that it is wrong for Alexander Nikitin to be accused of criminal activity and treason because of our involvement as the beneficiaries of his findings. These findings form the basis of the European programmes to protect the Barents Sea, and that is why I would ask you to examine a draft letter setting out the most important facts and to make Parliament's position, as expressed in the res
 olutions which it has adopted, clear as far as Russia is concerned.
+
+Yes, Mrs Schroedter, I shall be pleased to look into the facts of this case when I have received your letter.
+
+Madam President, I would firstly like to compliment you on the fact that you have kept your word and that, during this first part-session of the new year, the number of television channels in our offices has indeed increased considerably. But, Madam President, my personal request has not been met. Although there are now two Finnish channels and one Portuguese one, there is still no Dutch channel, which is what I had requested because Dutch people here like to be able to follow the news too when we are sent to this place of exile every month. I would therefore once more ask you to ensure that we get a Dutch channel as well.
+
+Mrs Plooij-van Gorsel, I can tell you that this matter is on the agenda for the Quaestors' meeting on Wednesday. It will, I hope, be examined in a positive light.
+
+Madam President, can you tell me why this Parliament does not adhere to the health and safety legislation that it actually passes? Why has no air quality test been done on this particular building since we were elected? Why has there been no Health and Safety Committee meeting since 1998? Why has there been no fire drill, either in the Brussels Parliament buildings or the Strasbourg Parliament buildings? Why are there no fire instructions? Why have the staircases not been improved since my accident? Why are no-smoking areas not enforced? It seems absolutely disgraceful that we pass legislation and do not adhere to it ourselves.
+
+Mrs Lynne, you are quite right and I shall check whether this has actually not been done. I shall also refer the matter to the College of Quaestors, and I am certain that they will be keen to ensure that we comply with the regulations we ourselves vote on.
+
+Madam President, Mrs Díez González and I had tabled questions on certain opinions of the Vice-President, Mrs de Palacio, which appeared in a Spanish newspaper. The competent services have not included them in the agenda on the grounds that they had been answered in a previous part-session.
+I would ask that they reconsider, since this is not the case. The questions answered previously referred to Mrs de Palacio' s intervention, on another occasion, and not to these comments which appeared in the ABC newspaper on 18 November.
+
+Mr Berenguer Fuster, we shall check all this. I admit that, at present, the matter seems to be somewhat confused. We shall therefore look into it properly to ensure that everything is as it should be.
+
+Madam President, I should like to know if there will be a clear message going out from Parliament this week about our discontent over today's decision refusing to renew the arms embargo on Indonesia, considering that the vast majority in this Parliament have endorsed the arms embargo in Indonesia in the past? Today's decision not to renew the embargo is extremely dangerous considering the situation there. So Parliament should send a message, since that is the wish of the vast majority. It is irresponsible of EU Member States to refuse to renew the embargo. As people have said, the situation there is extremely volatile. There is, in fact, a risk of a military coup in the future. We do not know what is happening. So why should EU arms producers profit at the expense of innocent people?
+
+In any event, this question is not presently included among the requests for topical and urgent debate on Thursday.
+
+Agenda
+The next item is the verification of the final version of the draft agenda as drawn up by the Conference of Presidents at its meeting of 13 January pursuant to Rule 110 of the Rules of Procedure. No amendments have been proposed relating to Monday and Tuesday.
+Relating to Wednesday:
+The Group of the Party of European Socialists requests that a Commission statement be included on its strategic objectives for the next five years and on the administrative reform of the Commission.
+I would like Mr Barón Crespo, who made the request, to speak to propose it. That is, if he so wishes, of course. Then we shall follow the usual procedure, hearing one speaker in favour and one against.
+
+Madam President, the presentation of the Prodi Commission' s political programme for the whole legislature was initially a proposal by the Group of the Party of European Socialists which was unanimously approved by the Conference of Presidents in September and which was also explicitly accepted by President Prodi, who reiterated his commitment in his inaugural speech.
+This commitment is important because the Commission is a body with a monopoly of initiative in accordance with the Treaties and, therefore, basically dictates this Parliament' s political and legislative activity for the next five years. I would also like to point out, Madam President, that this Parliament voted to express its confidence in President Prodi during the previous legislature. It did so again during this legislature, in July, and then, in September, it voted once more to approve the whole Commission. There has therefore been enough time for the Commission to prepare its programme and for us to become familiar with it and explain it to our citizens. To this end, I would like to remind you of the resolution of 15 September, which recommended that the proposal be presented as soon as possible.
+The events of last week - which originated outside the Conference of Presidents, that Conference being used simply to corroborate and ratify decisions taken elsewhere - present us with a dilemma. Either the Commission is not ready to present this programme, in which case it should clarify it. According to its President, it is in a position to do so. Given that the Commission is represented by Vice-President de Palacio, I believe that, before voting, it would help if the Commission could let us know how ready it is to present this programme, as agreed. Alternatively, Parliament is not ready to examine this programme, as some appear to be suggesting. In my opinion, this second hypothesis would imply the failure of Parliament in its duty as a Parliament, as well as introducing an original thesis, an unknown method which consists of making political groups aware, in writing, of a speech concerning the Commission' s programme a week earlier - and not a day earlier, as had been agreed - b
 earing in mind that the legislative programme will be discussed in February, so we could forego the debate, since on the next day our citizens will hear about it in the press and on the Internet and Parliament will no longer have to worry about it.
+My Group believes that since a parliament is meant to listen, debate and reflect, there can be no justification whatsoever for this delay and we believe that, if the Commission is ready to do so, we still have time to re-establish the original agreement between Parliament and the Commission and proceed in a manner which fulfils our duty to our fellow citizens. Therefore, the proposal of the Group of the Party of European Socialists, and which you have mentioned, is that the Prodi Commission present its legislative programme on Wednesday, including its proposed administrative reform, because, otherwise, we could find ourselves in a paradoxical situation: on the pretext that there is no text, on the one hand, the President of the Commission would be denied his right to speak in this Parliament and, on the other hand, there would be a debate on a reform when Parliament had no prior knowledge of the texts on which it is based. Therefore, Madam President, I would ask you to request that 
 the Commission express its opinion on this issue and that we then proceed to the vote.
+(Applause from the PSE Group)
+
+Madam President, I really am quite astonished at Mr Barón Crespo' s behaviour and the fact that he is now asking for this item to be put on Wednesday's agenda.
+Mr Barón Crespo, you were unable to attend the Conference of Presidents last Thursday. I am not criticising this; it happens from time to time that people send someone to represent them. Mr Hänsch represented you on this occasion. In the Conference of Presidents, we had an in-depth discussion. Your Group was alone in advocating what you are saying now. We then put it to a vote. As you know, each chairman has the same number of votes as his Group has Members. There was a vote on this matter. As I recall, the outcome of this vote was 422 votes to 180 with a few abstentions. This means that all the Groups with the exception of the non-attached Members - but, of course, they are not a Group - were in agreement; only your Group thought that we should proceed as you have proposed here. All of the others were of a different opinion. That was the decision.
+I should now like to comment on the issue itself. We have confidence in the Commission and in Romano Prodi and, after a difficult procedure, as everyone knows, the vast majority of our Group supported the vote of confidence in Romano Prodi and the Commission. We believe, however, that the Commission's strategic plan needs to be debated within a proper procedural framework, not only on the basis of an oral statement here in the European Parliament, but also on the basis of a document which is adopted in the Commission and which describes this programme over the five-year period. There is no such document!
+
+The Commission will present its programme for the year 2000 in February. We have said, very well, if the Commission does not wish to introduce the 2000 programme as early as January then we will do it in February. We have agreed to this. After all, we do not wish to quarrel with the Commission; if at all possible, we believe that the Commission and Parliament need to tread the same path. However, we in Parliament also have a supervisory role with regard to the Commission and we do not have to agree with everything which comes out of the Commission.
+I should like us to be able to do a reasonable amount of preparation for the debate on the five-year programme in our Groups. You cannot prepare if you hear a statement in this House and have no idea of its content. That is why we would recommend - and it is my impression that the Commission is also open to this idea - that we hold the debate on the Commission's long-term programme up to the year 2005 in February - and I hope that the Commission will agree on a programme before then which it will propose to us - and that, at the same time, in February we also hold the debate on the Commission's legislative programme for the year 2000. The fact that the subjects are connected also suggests that we should hold the debate on both programmes together. That is why my Group firmly rejects the proposal made by the Socialist Group.
+(Applause from the PPE-DE Group)
+
+Madam President, I would like to make it very clear that, above all, the Commission has absolute respect for the decisions of this Parliament and, amongst those, the decision establishing its agenda. We therefore respect whatever Parliament may decide.
+But I would also like to make it very clear that President Prodi made a commitment to this Parliament to introduce a new debate, as Mr Barón Crespo has reminded us, which would be in addition to the annual debate on the Commission' s legislative programme, on the broad areas of action for the next five years, that is to say, for this legislature.
+Madam President, I would like to say that the agreement reached in September distinguished this debate from the annual presentation of the Commission' s legislative programme. I would also like to say that the Commission is prepared and ready to hold this debate whenever it is convenient and that we were ready to do so this week as we had agreed originally, on the basis that it would be presented the day before in a speech to parliamentary groups.
+Therefore, Madam President, I would like to repeat that the Commission has debated the action plan for the next five years and, when Parliament decides, - this week if that is the decision - we are prepared to come and explain the programme for the next five years and, next month, the programme for 2000, which is what we fully agreed upon.
+
+I propose that we vote on the request of the Group of the Party of European Socialists that the Commission statement on its strategic objectives should be reinstated.
+(Parliament rejected the request) President. Still on the subject of Wednesday' s sitting, I have another proposal regarding the oral question on capital tax. The PPE-DE Group is requesting that this item be taken off the agenda.
+Is there a member who wishes to speak on behalf of this Group to propose this?
+
+Madam President, I can hear a ripple of laughter from the Socialists. I was told that large sections of the Socialist Group were also keen to have this item taken off the agenda, because at the vote in the Conference of Presidents no vote was received from the working group of Members of the Socialist Group responsible for this matter. I do not know whether this information is correct, but the PPE-DE Group would, in any case, be grateful if this item were removed because Parliament has addressed this issue several times already. Decisions have also been adopted against a tax of this kind. That is why my Group moves that this item be taken off the agenda.
+
+Thank you, Mr Poettering.
+We shall now hear Mr Wurtz speaking against this request.
+
+Madam President, I would firstly like to point out Mr Poettering' s lack of logic. He has just been preaching to the Group of the Party of European Socialists because they went back on a decision taken in a perfectly clear manner at the Conference of Presidents, and now he is doing just the same. We discussed that matter and we were unanimous, with the exception of the PPE and ELDR Groups. As my fellow chairmen will recall, I even mentioned that it was not a matter of knowing whether one was for or against the Tobin tax, but of whether one dared to hear what the Commission and the Council thought of it. It is not a lot to ask. I therefore repeat the proposal that this oral question to the Commission and the Council should be retained so that we can find out, once and for all, the positions of these two bodies regarding the proposal which is relatively modest but which would give a clear message to public opinion, particularly after the tide of feeling generated by the failure of the
  Seattle Conference.
+
+We shall proceed to vote on the PPE-DE Group' s request that the oral question regarding the capital tax be withdrawn from the agenda.
+(Parliament rejected the request, with 164 votes for, 166 votes against and 7 abstentions)
+
+Madam President, I would like to thank Mr Poettering for advertising this debate. Thank you very much.
+
+Madam President, has my vote been counted? I was unable to vote electronically, since I do not have a card. My vote was "in favour" .
+
+Indeed, if we add the two Members who have declared themselves, then the result of the vote would be ....
+
+Madam President, the Presidency has already declared the result of the vote. There is no room for amendments.
+
+Ladies and gentlemen, once again, we see it is essential for Members to bring their voting cards along on a Monday. Clearly there is a problem here. That being the case, I shall have to make a decision.
+I too forgot my card, and I would have voted against. I therefore consider that the oral question may be kept on the agenda as per the vote.
+This is the last time that we shall make allowances for Members who have forgotten their cards. Let that be clearly noted and understood.
+(Applause)The oral question will therefore remain on the agenda, and yes, your President is entitled to vote just as she is entitled to forget her voting card.
+We shall continue with the other amendments to the agenda.
+
+Madam President, in the earlier vote - and I will abide by your ruling on this matter - on the question of the strategic plan of the Commission I indicated that I would like to speak in advance of the vote on behalf of my Group. That did not happen. I would appreciate it if, on the close of this item of business, I might be allowed to give an explanation of vote on behalf of my Group. This is an important matter. It would be useful for the record of the House to state how people perceive what we have just done in the light of their own political analysis.
+
+Madam President, I do not wish to reopen the debate, but I had also asked for the floor, to comment on Mr Barón Crespo's motion. You did not call me either. I regret this, but the vote has already been taken and the decision is made so let us leave the matter there.
+
+I am terribly sorry, Mr Hänsch and Mr Cox. I did not see you asking to speak. Even so, I think the positions are quite clear and they shall be entered in the Minutes. When we adopt the Minutes for today' s sitting tomorrow, then any Members who think the positions have not been explained clearly enough may ask for amendments. This seems to me to be a workable solution. Of course, the Minutes for tomorrow' s sitting will take into account any additional explanations. I think this is a better solution than proceeding now to extremely time-consuming explanations of votes. Mr Cox, Mr Hänsch, would this be acceptable to you?
+
+Madam President, if the vote records correctly how my Group voted I shall not, and cannot, object to that. If your ruling is that I cannot give an explanation of vote, I accept that but with reservations.
+
+We shall pay particular attention to the wording of the Minutes, as we always do, of course. If they do not properly reflect the positions adopted, then we may correct them, if necessary.
+(The order of business was adopted thus amended)
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/es.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/es.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/es.test
new file mode 100644
index 0000000..c113c5e
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/es.test
@@ -0,0 +1,107 @@
+Reanudación del período de sesiones
+Declaro reanudado el período de sesiones del Parlamento Europeo, interrumpido el viernes 17 de diciembre pasado, y reitero a Sus Señorías mi deseo de que hayan tenido unas buenas vacaciones.
+Como todos han podido comprobar, el gran "efecto del a�o 2000" no se ha producido. En cambio, los ciudadanos de varios de nuestros pa�ses han sido v�ctimas de cat�strofes naturales verdaderamente terribles. Sus Se�or�as han solicitado un debate sobre el tema para los pr�ximos d�as, en el curso de este per�odo de sesiones. A la espera de que se produzca, de acuerdo con muchos colegas que me lo han pedido, pido que hagamos un minuto de silencio en memoria de todas las v�ctimas de las tormentas, en los distintos pa�ses de la Uni�n Europea afectados. Invito a todos a que nos pongamos de pie para guardar un minuto de silencio.
+(El Parlamento, de pie, guarda un minuto de silencio)
+
+Se�ora Presidenta, una cuesti�n de procedimiento. Sabr� usted por la prensa y la televisi�n que se han producido una serie de explosiones y asesinatos en Sri Lanka. Una de las personas que recientemente han asesinado en Sri Lanka ha sido al Sr. Kumar Ponnambalam, quien hace pocos meses visit� el Parlamento Europeo. �Ser�a apropiado que usted, Se�ora Presidenta, escribiese una carta al Presidente de Sri Lanka expresando las condolencias del Parlamento por esa y otras muertes violentas, pidi�ndole que haga todo lo posible para encontrar una reconciliaci�n pac�fica ante la extremadamente dif�cil situaci�n que est� viviendo su pa�s?
+
+Sí, señor Evans, pienso que una iniciativa como la que usted acaba de sugerir sería muy adecuada. Si la Asamblea está de acuerdo, haré lo que el señor Evans acaba de sugerir.
+
+ Se�ora Presidenta, una cuesti�n de procedimiento. Me gustar�a que me asesorara sobre el Art�culo 143 concerniente a la inadmisibilidad. Mi pregunta se refiere a un asunto del que se hablar� el jueves, d�a que en volver� a plantearla.
+El informe Cunha sobre los programas de direcci�n plurianual se presenta al Parlamento el jueves y contiene una propuesta en el apartado 6 en torno a una forma de penalizaciones basada en cuotas que debe aplicarse a los pa�ses que no cumplan anualmente sus objetivos de reducci�n de flota. El informe estipula que se debe aplicarse a pesar del principio de estabilidad relativa. Creo que el principio de estabilidad relativa es un principio legal fundamental de las pol�ticas pesqueras comunitarias, por lo que una propuesta que lo subvierta es legalmente inadmisible. Quiero saber si se puede hacer este tipo de objeci�n a lo que s�lo es un informe, no una propuesta legislativa, y si es algo que puedo plantear el jueves.
+
+Su Señoría, si así lo desea, podrá plantear esta cuestión en ese momento, es decir, el jueves antes de que se presente el informe.
+
+Se�ora Presidenta, coincidiendo con el primer per�odo parcial de sesiones de este a�o del Parlamento Europeo, lamentablemente, en los Estados Unidos, en Texas, se ha fijado para el pr�ximo jueves la ejecuci�n de un condenado a la pena capital, un joven de 34 a�os que llamaremos con el nombre de Hicks.
+A petici�n de un diputado franc�s, el Sr. Zimeray, se ha presentado una solicitud, cuyos firmantes han sido numerosos y entre los cuales me cuento, sin embargo, de conformidad con la posici�n constantemente expresada por el Parlamento Europeo y por toda la Comunidad europea, le pido que intervenga con el prestigio de su cargo y de la Instituci�n que representa ante el Presidente y el Gobernador del Estado de Texas, Sr. Bush, que tiene la facultad de suspender la condena a muerte y de indultar al condenado.
+Todo ello, de conformidad con los principios que siempre hemos apoyado.
+
+Gracias, señor Segni, lo haré de muy buen grado. Está, sin duda, dentro de la línea de las posiciones que nuestro Parlamento siempre ha adoptado.
+
+Se�ora Presidenta, quisiera llamar la atenci�n de usted acerca de un caso del que se ha ocupado una y otra vez este Parlamento. Es el caso de Alexander Nikitin. Todos celebramos aqu� que el tribunal le haya declarado inocente y que haya puesto de manifiesto que tambi�n en Rusia el acceso a la informaci�n sobre el medio ambiente es un derecho constitucional. Sin embargo, sucede que va a ser acusado de nuevo, pues el fiscal interpondr� un recurso. Sabemos y lo hemos afirmado realmente en muchas resoluciones -precisamente en la �ltima sesi�n plenaria del a�o pasado- que �ste no es solamente un caso jur�dico y que es un error acusar a Alexander Nikitin de conducta criminal y de traici�n, porque nosotros, como afectados por sus resultados, obtengamos provecho de ella. Esos resultados constituyen la base de los programas europeos del mar de Barent y, por ello, le pido que estudie un borrador de carta que le describe los hechos m�s relevantes y que haga patente a Rusia esta po
 sici�n que es acorde con las decisiones del Parlamento.
+
+Sí, señora Schroedter, de buena gana voy a examinar los hechos relacionados con este tema en cuanto reciba su carta.
+
+Se�ora Presidenta, en primer lugar me gustar�a felicitarla por haber cumplido con su promesa ya que efectivamente en este primer periodo de sesiones del a�o nuevo se ha ampliado dr�sticamente el n�mero de canales en nuestros despachos. Sin embargo, se�ora Presidenta, no se ha hecho lo que yo hab�a pedido. Bien es verdad que ahora hay dos canales finlandeses y uno portugu�s, pero a�n no hay ninguno neerland�s y yo le hab�a pedido un canal neerland�s, porque tambi�n los neerlandeses quisi�ramos escuchar las noticias cada mes, cuando nos mandan a este exilio. Por lo tanto quisiera pedirle nuevamente que se encargue de que podamos ver tambi�n un canal neerland�s.
+
+Señora Plooij-van Gorsel, puedo decirle que esa cuestión está en el orden del día de la reunión de cuestores fijada para el miércoles. Espero que se examine con un espíritu positivo.
+
+Se�ora Presidenta, �podr�a decirme por qu� este Parlamento no cumple la legislaci�n de sanidad y seguridad que en realidad ha aprobado? �Por qu� no se ha hecho ninguna prueba de calidad del aire de este edificio desde que hemos sido elegidos? �Por qu� no se ha celebrado ninguna reuni�n del Comit� de Sanidad y Seguridad desde 1998? �Por qu� no hemos tenido simulacros de incendio ni en los edificios del Parlamento de Bruselas ni en los del Parlamento de Estrasburgo? �Por qu� no hay instrucciones en caso de incendio? �Por qu� no se han mejorado las escaleras desde mi accidente? �Por qu� no se respetan las �reas de no fumadores? Es totalmente vergonzoso que aprobemos una legislaci�n y no la cumplamos ni nosotros mismos.
+
+Se�ora Lynne, tiene toda la raz�n del mundo, y verificar� si estas cosas no se han hecho de verdad. Tambi�n someter� el problema a la Junta de Cuestores, y estoy segura de que los cuestores har�n lo debido para que respetemos las normas que, efectivamente, nosotros mismos votamos.
+
+Sra. Presidenta, la Sra. D�ez Gonz�lez y yo mismo hab�amos presentado unas preguntas sobre determinadas opiniones, reproducidas en un diario espa�ol, de la Vicepresidenta, Sra. de Palacio. Los servicios competentes no las han incluido en el orden del d�a por considerar que ya hab�an sido contestadas en una sesi�n anterior.
+Ruego que se reconsidere esta decisi�n porque esto no es as�. Las preguntas contestadas con anterioridad se refer�an a la intervenci�n, en determinado expediente, de la Sra. de Palacio, no a esas declaraciones aparecidas en el diario ABC el 18 de noviembre pasado.
+
+Señoría, vamos a verificar todo esto. Le confieso que, de momento, las cosas me parecen un tanto confusas. Por consiguiente, vamos a revisarlas con seriedad para que todo esté en orden.
+
+Se�ora Presidenta, me gustar�a saber si el Parlamento emitir� esta semana un comunicado claro sobre el descontento por la decisi�n de hoy de no renovar el embargo de armas a Indonesia, considerando que la inmensa mayor�a de este Parlamento apoy� en el pasado el embargo de armas a Indonesia. La decisi�n de hoy de no renovar el embargo es extremadamente peligrosa teniendo en cuenta la situaci�n de ese pa�s. Por lo tanto, el Parlamento debe enviar un mensaje, ya que �se es el deseo de la inmensa mayor�a. Es irresponsable por parte de los Estados Miembros de la UE negarse a renovar el embargo. Como ha dicho la gente, la situaci�n all� es extremadamente precaria. De hecho, existe el riesgo de un golpe militar en el futuro. No sabemos lo que est� ocurriendo all�. �Por qu� tienen que beneficiarse los productores de armas de la UE a expensas de gente inocente?
+
+En cualquier caso, este tema no figura ahora mismo entre las demandas de urgencia para el jueves próximo.
+
+Orden de los trabajos
+De conformidad con el orden del d�a, se procede a examinar el proyecto definitivo de orden del d�a, establecido de conformidad con el art�culo 110 del Reglamento en la Conferencia de Presidentes, el jueves 13 de enero. En lo relativo al lunes y al martes, no hay modificaciones.
+Miércoles :
+El Grupo del Partido de los Socialistas Europeos solicita que se incluya una declaraci�n de la Comisi�n sobre sus objetivos estrat�gicos para los pr�ximos cinco a�os, as� como sobre la reforma administrativa de la Comisi�n.
+Desear�a que el Sr. Bar�n Crespo, autor de la petici�n, interviniera para justificarla, si a �l le parece adecuado, por supuesto. Como siempre, escucharemos a un orador a favor y a otro en contra.
+
+Se�ora Presidenta, la presentaci�n del programa pol�tico de la Comisi�n Prodi para toda la legislatura fue inicialmente una propuesta del Grupo del Partido de los Socialistas Europeos que logr� la unanimidad de la Conferencia de Presidentes en septiembre y tambi�n la aceptaci�n expl�cita del Presidente Prodi, que reiter� su compromiso en su discurso de investidura.
+Tiene importancia este compromiso en la medida en que la Comisi�n es un organismo que tiene el monopolio de la iniciativa de acuerdo con los Tratados y, por lo tanto, configura b�sicamente lo que va a ser la actividad pol�tica y legislativa de este Parlamento en los pr�ximos cincos a�os. Recuerdo adem�s, se�ora Presidenta, que este Parlamento vot� en dos ocasiones en la anterior legislatura su confianza en el Presidente Prodi; en esta legislatura la vot� de nuevo en julio y despu�s, con la nueva Comisi�n en funciones, volvi� en septiembre a darle un voto de confianza a toda la Comisi�n. Por lo tanto, ya ha habido tiempo suficiente para que la Comisi�n prepare su programa y para que nosotros lo podamos conocer y podamos explic�rselo a los ciudadanos. En este sentido recuerdo la resoluci�n del 15 de septiembre, en la que se recomendaba que se presentara la propuesta en el plazo m�s breve posible.
+Los hechos ocurridos la semana pasada -que se han originado al margen de la Conferencia de Presidentes y utiliz�ndola s�lo para corroborar y ratificar decisiones tomadas fuera de ella- plantean un dilema: o bien la Comisi�n no est� en condiciones de presentar ese programa (en ese caso convendr�a que lo aclarara. Seg�n palabras de su Presidente, est� en condiciones de hacerlo. Dado que la Comisi�n est� representada por la Vicepresidenta, Sra. de Palacio, creo que antes de votar ser�a conveniente conocer la situaci�n de la Comisi�n en relaci�n con su disponibilidad para presentar el programa, tal como se hab�a convenido); o bien el Parlamento no est� en condiciones de examinar este programa, como parece que pretenden algunos. En mi opini�n, esta segunda hip�tesis significar�a hacer dejaci�n de nuestras responsabilidades como Parlamento, adem�s de introducir una tesis original, un m�todo desconocido consistente en dar a conocer a los grupos pol�ticos por escrito e
 l discurso program�tico de la Comisi�n una semana antes -y no el d�a antes, como se hab�a convenido-, teniendo en cuenta que el programa legislativo se discutir� en febrero, de tal manera que podr�amos prescindir del debate, porque al d�a siguiente la prensa e Internet lo habr�an dado a conocer a todos los ciudadanos y el Parlamento no tendr�a ya por qu� ocuparse del asunto.
+Como mi Grupo opina que un Parlamento est� hecho para escuchar, para debatir y para reflexionar, pensamos que no hay raz�n alguna que justifique este aplazamiento y creemos que si, la Comisi�n est� en condiciones de hacerlo, estamos perfectamente a tiempo para poder restablecer el acuerdo original entre el Parlamento y la Comisi�n y proceder con responsabilidad ante nuestras conciudadanas y nuestros conciudadanos. Por lo tanto, la propuesta que hace el Grupo del Partido de los Socialistas Europeos y que su Se�or�a ha mencionado es que el mi�rcoles se mantenga la presentaci�n del programa de legislatura de la Comisi�n Prodi, incluyendo dentro de este programa tambi�n el proyecto de reforma administrativa porque, de no ser as�, nos podemos encontrar con una situaci�n parad�jica: con la excusa de que no hay texto, se niega por una parte el derecho al Presidente de la Comisi�n a hablar en este Parlamento y, por otra parte, que tenga lugar un debate sobre la reforma sin qu
 e este Parlamento conozca previamente los textos en que se basa. Por lo tanto, le ruego, se�ora Presidenta, que pida a la Comisi�n que se manifieste en este momento y despu�s que se proceda al voto.
+(Aplausos del grupo PSE)
+
+Se�ora Presidenta, estimados colegas, estoy un tanto sorprendido por la conducta de nuestro colega, el Sr. Bar�n Crespo, que ahora pide que este punto del orden del d�a se incluya en el orden del d�a del mi�rcoles.
+Se�or Bar�n Crespo, usted no pudo asistir el jueves pasado a la Conferencia de Presidentes. No se lo critico; ocurre a veces que se delega la representaci�n. Nuestro colega, el Sr. H�nsch, le represent� a usted en la misma. En la Conferencia de Presidentes, tuvimos un debate muy minucioso. Solamente su Grupo mantuvo lo que usted dice ahora. Votamos luego. Cada Presidente o cada Presidenta tiene tantos votos como miembros tiene su Grupo. Hubo una votaci�n sobre este punto. Seg�n recuerdo el resultado de la votaci�n fue el siguiente: 422 votos contra 180 con unas pocas abstenciones. Esto significa que todos los Grupos, con excepci�n de los que no pertenecen a grupos -los cuales, ciertamente, no constituyen grupo alguno- estaban de acuerdo. S�lo su Grupo opinaba que hab�a que proceder tal como usted acaba de proponer aqu�. Todos los dem�s eran de otra opini�n. As� fue el acuerdo.
+Ahora quisiera decir algo sobre el asunto mismo. Nosotros tenemos confianza en la Comisi�n, en Romano Prodi, y la gran mayor�a de nuestro Grupo manifest� su confianza a Romano Prodi y a la Comisi�n despu�s de un dif�cil proceso, como todos sabemos. Sin embargo, somos tambi�n de la opini�n de que debemos celebrar un debate sobre esta estrategia de la Comisi�n en un procedimiento, no s�lo a causa de una explicaci�n oral que ha tenido lugar aqu�, en el Parlamento Europeo, sino tambi�n a causa de un documento que se ha aprobado en la Comisi�n y que describe este programa para cinco a�os. Tal documento no est�.
+La Comisi�n presentar� en febrero el programa para el a�o 2000. Nosotros hemos dicho, bueno, si la Comisi�n no quiere hacer todav�a el Programa 2000 en enero, hag�moslo en febrero. Hemos otorgado nuestra conformidad. En efecto, no queremos tener ninguna disputa con la Comisi�n, sino que somos de la opini�n de que, si es posible, la Comisi�n y el Parlamento han de marchar por un camino com�n. Pero, como Parlamento, somos la instancia controladora de la Comisi�n. Y no todo lo que proceda de la Comisi�n debe ser opini�n nuestra.
+Quisiera que en los Grupos nos pudi�ramos preparar razonablemente para un debate sobre el programa para cinco a�os. No es posible una preparaci�n, si se escucha aqu� una declaraci�n y no se sabe cu�l es el contenido de tal declaraci�n. Por esta raz�n, nuestra propuesta -y mi impresi�n es que la Comisi�n se encuentra tambi�n abierta a estas ideas- es que celebremos en febrero el debate sobre el plan a largo plazo de la Comisi�n hasta el a�o 2005 -espero que de aqu� a all� la Comisi�n se ponga tambi�n de acuerdo en un programa que tendr� que proponernos- y que nosotros, en febrero, celebremos tambi�n, al mismo tiempo, el debate sobre el programa legislativo de la Comisi�n para el a�o 2000. Por consiguiente, es un contexto objetivo y sensato el que nos aconseja celebrar conjuntamente el debate sobre ambos programas. Por esta raz�n mi Grupo rechaza de manera decidida la propuesta del Grupo Socialista.
+(Aplausos del Grupo PPE-DE)
+. Se�ora Presidenta, quiero dejar muy claro que, ante todo, la Comisi�n tiene el m�ximo respeto por las decisiones de este Parlamento y, entre ellas, la de establecer su orden del d�a. Por lo tanto, nosotros respetamos lo que en este sentido pueda decidir el Parlamento.
+Pero quiero dejar tambi�n muy claro que el Presidente Prodi se comprometi� con el Parlamento a incorporar un nuevo debate, como ha recordado el Sr. Bar�n, que se a�ade al debate anual sobre el programa legislativo de la Comisi�n, sobre las grandes l�neas de actuaci�n para el pr�ximo per�odo de cinco a�os, es decir, para esta legislatura.
+Quiero decir, se�ora Presidenta, que este debate se distingu�a, en el acuerdo al que se lleg� en el mes de septiembre, de lo que es la presentaci�n anual del programa legislativo de la Comisi�n. Y quiero decir, se�ora Presidenta, que, por parte de la Comisi�n, estamos preparados y dispuestos a tener ese debate cuando convenga, que est�bamos preparados para desarrollarlo esta semana, como en principio se hab�a acordado, partiendo de la base de que se presentaba la v�spera en un discurso a los grupos parlamentarios.
+Por lo tanto, se�ora Presidenta, quiero reiterar que, por nuestra parte, hemos debatido el programa de actuaci�n para los pr�ximos cinco a�os y que estamos preparados para, cuando as� lo decida el Parlamento, -esta misma semana si �sa es la decisi�n- venir a exponer el programa para los pr�ximos cinco a�os y, el mes que viene, el programa para el a�o 2000, que era lo que estaba perfectamente acordado.
+
+Propongo que votemos la petici�n del Grupo del Partido de los Socialistas Europeos para volver a incluir la declaraci�n de la Comisi�n sobre sus objetivos estrat�gicos.
+(El Parlamento rechaza la petici�n) El Presidente. Sobre el tema del mi�rcoles, tengo otra propuesta relativa a la pregunta oral sobre el impuesto al capital. El grupo PPE-DE solicita que se retire este punto del orden del d�a.
+�Alg�n colega tomar� la palabra en nombre del grupo para justificar esta petici�n?
+
+Se�ora Presidenta, escucho algunas risas entre los socialistas. Me han dicho que tambi�n amplios c�rculos del Grupo Socialista ven de buen grado la supresi�n de este punto del orden del d�a, ya que en la votaci�n habida en la Conferencia de Presidentes falt� el voto del grupo de trabajo competente para ello de nuestros colegas del Grupo Socialista. Ignoro si esta informaci�n es correcta, pero nosotros como Grupo del PPE/DE agradecer�amos en cualquier caso que se suprimiera este punto, ya que el Parlamento se ha ocupado ya numerosas veces de esta cuesti�n. Tambi�n existen acuerdos contra semejante impuesto. Por consiguiente, mi Grupo solicita que se suprima este punto del d�a.
+
+Gracias, señor Poettering.
+Escuchemos ahora al Sr. Wurtz, que hablará en contra de la petición.
+
+Se�ora Presidenta, ante todo quiero subrayar la falta de l�gica del se�or Poettering, que acaba de dar una lecci�n al Grupo de los Socialista Europeos para que se vuelva a tratar una decisi�n que se tom� de un modo absolutamente claro en la Conferencia de Presidentes. Y �l hace lo mismo. Hemos discutido, expresamos unanimidad, con excepci�n de los Grupos del PPE y del Partido Europeo de los Liberales Dem�cratas y Reformistas, y yo mismo hice notar -como lo recordar�n mis queridos compa�eros presidentes- que no se trata de saber si est�n ustedes a favor o en contra del impuesto Todin, sino de saber si se atreven a comprender lo que piensan al respecto la Comisi�n y el Consejo. No es pedir demasiado. Por lo tanto, reitero la propuesta de mantener esta pregunta oral a la Comisi�n y al Consejo para saber de una vez por todas cu�l es la posici�n de esos dos �rganos en lo que se refiere a esta propuesta relativamente modesta, pero que ser�a una se�al importante para la 
 opini�n, sobre todo despu�s de la conmoci�n que suscit� el fracaso de la Conferencia de Seattle.
+
+Se vota la propuesta del Grupo PPE-DE para retirar del orden del día la pregunta oral referida al impuesto sobre el capital.
+(El Parlamento rechaza la propuesta por 164 votos a favor, 166 votos en contra y 7 abstenciones)
+
+Señora Presidenta, quiero agradecer al señor Poettering la publicidad que acaba de dar a este debate. Gracias.
+
+Señora Presidenta, ¿se ha contabilizado mi voto, que no ha podido ser realizado electrónicamente, porque no tengo la tarjeta? Mi voto era "a favor" .
+
+En efecto, si se suman los dos colegas que han hablado, el resultado sería...
+
+
+Señora Presidenta, la Presidencia ha proclamado el resultado de la votación. No caben modificaciones.
+
+Señorías, repito una vez más que es preciso que todos tengamos la tarjeta. Es evidente que se trata de un problema. En estas circunstancias, debo tomar una decisión.
+También yo he olvidado mi tarjeta y habría votado en contra. Considero, pues, que la pregunta oral se mantiene en el orden del día.
+
+Es la última vez que tomaremos en cuenta las tarjetas olvidadas. Que esto quede bien claro y que se avise a todos.
+(Aplausos)
+Sí, la pregunta oral se mantiene en el orden del día y sí, la Presidenta tiene el derecho de votar, como también tiene el derecho de olvidar su tarjeta.
+Continuamos con las otras modificaciones del orden del día.1
+
+Señora Presidenta, en la votación anterior -y me atendré a su decisión en este asunto- sobre la cuestión del plan estratégico de la Comisión, indiqué que me habría gustado hablar antes de la votación en nombre de mi Grupo. Esto no ha ocurrido. Le agradecería que, al cierre de este punto, se me permitiera dar una explicación del voto en nombre de mi Grupo. Es un asunto importante. Sería útil que el historial de la Cámara registrara cómo percibe la gente lo que hemos hecho a la luz de sus propios análisis políticos.
+
+Señora Presidenta, no deseo reanudar el debate, pero había pedido también la palabra para dar mi opinión acerca de la enmienda del Sr. Barón Crespo. Tampoco me ha nombrado usted. Lo lamento, pero la votación se ha realizado, se ha adoptado la decisión y, por consiguiente, dejemos así las cosas.
+
+Lo siento mucho, se�or H�nsch, se�or Cox, no he advertido que ustedes ped�an la palabra. En estas circunstancias, creo que las posiciones est�n claras y que se reflejar�n en el Acta. Cuando ma�ana se trate la aprobaci�n del Acta de la sesi�n de hoy, si Sus Se�or�as estiman que las posiciones no se han explicado lo bastante bien, podr�n pedir modificaciones. Creo que es una buena f�rmula. Por supuesto que el Acta de la sesi�n de ma�ana consignar� todas las explicaciones complementarias. Creo que esta f�rmula es mejor que la de proceder ahora a unas explicaciones de voto que nos llevar�an mucho tiempo. Sr. H�nsch, Sr. Cox, �les parece bien?
+
+ Señora Presidenta, si en el registro de la votación consta correctamente cómo ha votado mi Grupo, ni quiero ni puedo plantear objeción alguna. Si su decisión es que no podemos explicar nuestro voto, la acataré, pero no sin reservas.
+
+Examinaremos con cuidado la redacción del Acta, aunque es algo que siempre hacemos. Si no refleja bien las posiciones, se podrá corregir.
+
+(El Parlamento aprueba el orden de los trabajos así modificado)
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/fi.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/fi.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/fi.test
new file mode 100644
index 0000000..6fd7f6d
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/fi.test
@@ -0,0 +1,106 @@
+Istuntokauden uudelleenavaaminen
+ Julistan perjantaina joulukuun 17. päivänä keskeytetyn Euroopan parlamentin istunnon avatuksi ja esitän vielä kerran vilpittömän toiveeni siitä, että teillä olisi ollut oikein mukava joululoma.
+Kuten olette varmaan saattaneet huomata, vuodenvaihteeseen 2000 povattuja suuria tietokoneongelmia ei ilmennytk��n. Sen sijaan todella kauheat luonnonkatastrofit koettelivat kansalaisia joissakin unionimme maissa. Te olette esitt�neet toiveen, ett� t�st� asiasta keskusteltaisiin l�hip�ivin� t�m�n istuntojakson aikana. Sill� v�lin toivoisin, kuten useampi kollega on minulle esitt�nytkin, ett� viett�isimme minuutin hiljaisuuden kaikkien niiden uhrien muistoksi, jotka saivat surmansa useita Euroopan unionin maita koetelleissa myrskyiss�. Kehotan, ett� nousette seisomaan t�m�n minuutin hiljaisuuden ajaksi
+(Parlamentti vietti seisaallaan minuutin hiljaisuuden.)
+
+Arvoisa puhemies, k�yt�n ty�j�rjestyspuheenvuoron. Olette varmaan saaneet tietoonne lehdist�n ja television kautta, ett� Sri Lankassa on sattunut useita kuolemaan johtaneita pommi-iskuja. N�iden Sri Lankassa aivan �skett�in surmansa saaneiden ihmisten joukossa on Kumar Ponnambalam, joka vieraili parlamentissa vain pari kuukautta sitten. K�visik� teille, arvoisa puhemies, ett� kirjoittaisitte Sri Lankan presidentille kirjeen, jossa v�litt�isitte surunvalittelut parlamentin puolesta Ponnambalanin kuoleman ja muiden Sri Lankassa sattuneiden v�kivaltaisten kuolemantapausten johdosta ja jossa pyyt�isitte presidentti� tekem��n kaikkensa, jotta t�h�n hyvin vaikeaan tilanteeseen l�ydett�isiin rauhanomainen sovitteluratkaisu?
+
+ Kyllä, jäsen Evans, mielestäni äsken ehdottamanne suuntainen aloite olisi varsin oikeaan osuva. Jos parlamentin jäsenet kannattavat sitä, teen niin kuin jäsen Evans ehdotti.
+
+Arvoisa puhemies, k�yt�n ty�j�rjestyspuheenvuoron. Haluaisin, ett� kertoisitte mielipiteenne ty�j�rjestyksen 143 artiklasta, joka koskee sit�, milloin jotakin asiaa ei oteta k�sitelt�v�ksi. Kysymykseni liittyy er��seen seikkaan, jota k�sitell��n torstaina ja jonka otan silloin taas esille.
+Cunhan monivuotisia ohjausohjelmia koskevaa mietint�� k�sitell��n parlamentissa torstaina, ja sen 6 kohtaan sis�ltyy ehdotus, jonka mukaan niiden maiden kohdalla, jotka eiv�t suoriudu vuosittaisista laivueita koskevista supistamistavoitteistaan, otetaan k�ytt��n er��nlaisia kiinti�iden v�hent�mist� tarkoittavia rangaistuksia. Mietinn�ss� sanotaan, ett� n�in olisi teht�v� suhteellisen vakauden periaatteesta huolimatta. Uskon, ett� suhteellisen vakauden periaate on perusluonteinen oikeudellinen periaate yhteisen kalastuspolitiikan alalla ja ett� ehdotusta, jossa sit� horjutetaan, olisi lains��d�nn�llisesti mahdoton hyv�ksy�. Haluaisin tiet��, onko mahdollista esitt�� t�llainen vastalause, jonka kohteena on pelkk� mietint� eik� lakiehdotus, ja onko minulla valtuuksia esitt�� t�llainen vastalause torstaina.
+
+ Te voitte tosiaankin juuri silloin tehdä tämän esityksen, mikäli sitä haluatte, eli torstaina ennen mietinnön käsittelyn alkamista.
+
+Arvoisa puhemies, samanaikaisesti Euroopan parlamentin t�m�n vuoden ensimm�isen istuntojakson kanssa Texasissa Yhdysvalloissa p��tettiin, ett� er�s nuori - 34-vuotias - kuolemaantuomittu, nimelt��n Hicks, teloitetaan valitettavasti ensi perjantaina.
+Ranskalaisen parlamentin j�senen Zimerayn pyynn�st� on esitetty vetoomus, jonka monet ovat allekirjoittaneet, ja olen itse yksi heist�, mutta pyyd�n, ett� te Euroopan parlamentin ja koko Euroopan yhteis�n johdonmukaisesti osoittaman kannan mukaisesti otatte virkanne suoman ja edustamanne toimielimen arvovallan voimin yhteytt� Texasin osavaltion johtajaan, kuvern��ri Bushiin, jolla on valtuudet kumota kuolemantuomio ja armahtaa kuolemaantuomittu.
+Ja kaikki t�m� niiden periaatteiden mukaisesti, joita olemme aina puolustaneet.
+
+ Kiitos, jäsen Segni, teen sen oikein mielelläni. Se on tosiaankin täysin niiden periaatteiden mukaista, joita parlamentti on aina puolustanut.
+
+Arvoisa puhemies, haluaisin kiinnitt�� huomionne tapaukseen, jota t�m� parlamentti on k�sitellyt toistuvasti. Se on tapaus Aleksandr Nikitin. Me kaikki t��ll� parlamentissa olemme iloisia siit�, ett� oikeus vapautti h�net ja teki selv�ksi, ett� ymp�rist�� koskevien tietojen saaminen on perustuslaillinen oikeus my�s Ven�j�ll�. Tilanne on nyt kuitenkin se, ett� h�n on joutunut uudelleen syytteeseen, koska yleinen syytt�j� valittaa oikeuden p��t�ksest�. Me tied�mme ja olemme todenneet sen todellakin eritt�in monissa p��t�slauselmissa - nimenomaan viime vuoden viimeisess� t�ysistunnossa -, ett� t�m� ei ole pelk�st��n oikeustapaus ja on v��rin syytt�� Aleksandr Nikitini� rikollisuudesta ja petoksesta, koska me asianosaisina hy�dymme h�nen tuloksistaan. N�m� tulokset muodostavat pohjan eurooppalaisille Barentsinmeren suojeluohjelmille, ja pyyd�n teit� siksi tarkastelemaan huolellisesti kirjeluonnosta, jossa selitet��n t�rkeimm�t tosi
 asiat, ja tekem��n t�m�n kannan selv�ksi Ven�j�ll� parlamentin p��t�sten hengess�.
+
+ Kyllä jäsen Schroedter, saatuani kirjeenne otan varsin mielelläni selvää tähän kysymykseen liittyvistä seikoista.
+
+Arvoisa puhemies, haluaisin aluksi kiitt�� teit� siit�, ett� olette pit�nyt sananne ja ett� nyt uuden vuoden ensimm�isen istuntojakson aikana huoneissamme n�kyvien televisiokanavien m��r� on todellakin kasvanut valtavasti. Arvoisa puhemies, se, mit� min� pyysin, ei ole kuitenkaan toteutunut. Nyt n�kyviss� on kaksi suomalaista kanavaa ja yksi portugalilainen kanava, ja min� pyysin teilt� alankomaalaista kanavaa, koska my�s alankomaalaiset haluavat seurata uutisia joka kuukausi, kun meid�t l�hetet��n t�nne karkotuspaikkaan. Haluaisin pyyt�� teit� viel� kerran huolehtimaan siit�, ett� me saamme my�s alankomaalaisen kanavan.
+
+ J�sen Plooij-van Gorsel, voin kertoa, ett� kyseinen asia on kvestorien ensi keskiviikon kokouksen esityslistalla. Toivon, ett� sit� k�sitell��n my�nteisess� hengess�.
+
+Arvoisa puhemies, voitteko kertoa minulle, miksi parlamentti ei noudata terveytt� ja turvallisuutta koskevaa lains��d�nt��, jonka se oikeastaan itse hyv�ksyy? Miksei juuri t�m�n rakennuksen ilmanlaatua ole testattu sin� aikana, kun olemme olleet j�seni�? Miksi terveys- ja turvallisuusasioista vastaava komitea ei ole kokoontunut vuoden 1998 j�lkeen? Miksi parlamentin Brysselin tiloissa tai Strasbourgin tiloissa ei ole j�rjestetty paloharjoituksia? Miksi tulipalon varalta ei ole annettu ohjeita? Miksi portaikossa ei ole tehty parannuksia minulle sattuneen onnettomuuden j�lkeen? Miksi "tupakointi kielletty" -alueita ei ole pantu t�yt�nt��n? On todella noloa, ett� hyv�ksymme lakeja emmek� noudata niit� itse.
+
+J�sen Lynne, te olette t�ysin oikeassa, ja aion tarkistaa, eik� kaikkea t�t� todellakaan ole tehty. Toimitan my�s t�m�n ongelman kvestorikollegion k�sitelt�v�ksi ja olen varma, ett� kvestoreillemme on eritt�in t�rke�� toimia niin, ett� noudatamme sellaista lains��d�nt��, jonka olemme itse hyv�ksyneet.
+
+Arvoisa puhemies, D�ez Gonz�lez ja min� olimme esitt�neet muutamia kysymyksi�, jotka koskivat tiettyj� komission varapuheenjohtaja de Palacion lausuntoja, jotka julkaistiin er��ss� espanjalaisessa p�iv�lehdess�. Toimivaltaiset yksik�t eiv�t ole ottaneet kysymyksi� esityslistalle, koska yksik�iden mukaan niihin on jo vastattu aiemmalla istuntojaksolla.
+Pyyd�n, ett� t�m� p��t�s perutaan, sill� asia ei ole niin. Ne kysymykset, joihin aiemmin vastattiin, liittyv�t komission varapuheenjohtaja De Palacion puheenvuoroon er��ss� asiassa, eik� n�ihin viime marraskuun 18. p�iv�n� ABC-lehdess� julkaistuihin lausuntoihin.
+
+Hyv� kollega, tarkistamme asian. Minun t�ytyy my�nt��, ett� t�ll� hetkell� asiat vaikuttavat minusta hieman sekavilta. Niinp� k�ymme asian eritt�in tarkasti l�pi, jotta kaikki saadaan hyv�lle tolalle.
+
+Arvoisa puhemies, haluaisin tiet��, l�hett��k� parlamentti t�ll� viikolla ulkomaailmaan selv�n viestin siit�, kuinka tyytym�tt�mi� olemme t�m�np�iv�iseen p��t�kseen, jonka mukaan aseiden vientikieltoa Indonesiaan ei jatketa, varsinkin, jos otamme huomioon, ett� parlamentin valtaenemmist� on aiemmin tukenut aseiden vientikieltoa Indonesiaan? T�m�np�iv�inen p��t�s olla jatkamatta vientikieltoa on eritt�in vaarallinen, jos otamme huomioon Indonesian tilanteen. N�in ollen parlamentin olisi l�hetett�v� t�llainen viesti, sill� valtaenemmist� toivoo sit�. On vastuutonta, ett� EU:n j�senvaltiot kielt�ytyv�t jatkamasta aseiden vientikieltoa. Kuten monet ovat kertoneet, tilanne on Indonesiassa ��rimm�isen r�j�hdysaltis. Vaarana todellakin on tuleva sotilasvallankaappaus. Me emme tied�, mit� tapahtuu. Miksi siis EU:n asevalmistajien olisi teht�v� voittoa viattomien ihmisten kustannuksella?
+
+Joka tapauksessa, t�m� kysymys ei ole toistaiseksi torstain ajankohtaiskeskustelun aiheiden joukossa.
+
+K�sittelyj�rjestys
+Esityslistalla on seuraavana lopullisen esityslistaluonnoksen k�sittely siin� muodossa kuin puheenjohtajakokous torstaina 13. tammikuuta ty�j�rjestyksen 110 artiklan mukaisesti sen laati. Maanantain ja tiistain osalta minulle ei ole ehdotettu muutoksia.
+Keskiviikon osalta:
+Sosialistiryhm� pyyt��, ett� esityslistalle otetaan alunperin esityslistaluonnokseen merkitty komission julkilausuma sen strategisista tavoitteista seuraavien viiden vuoden ajaksi sek� komission hallinnollisesta uudistuksesta.
+Toivoisin, ett� j�sen Bar�n Crespo, pyynn�n esitt�j�, k�ytt�isi puheenvuoron ja perustelisi pyynt�ns�, mutta tietenkin vain, mik�li h�n niin haluaa. Sen j�lkeen teemme, kuten tapanamme on: kuulemme yhden puheenvuoron puolesta ja yhden vastaan.
+
+Arvoisa puhemies, Prodin johtaman komission koko toimikauden kattavan poliittisen ohjelman esittely oli alun perin Euroopan parlamentin sosiaalidemokraattisen puolueen ryhm�n ehdotus, ja se hyv�ksyttiin yksimielisesti syyskuun puheenjohtajakokouksessa, ja samoin komission puheenjohtaja Prodi antoi sille selv�n hyv�ksynt�ns� ja toisti nimityksens� yhteydess� pit�m�ss��n puheessa sitoutumistaan siihen.
+T�ll� sitoutumisella on merkityst� siin� mieless�, ett� komissiolla on elimen� perustamissopimusten mukaisesti yksinoikeus tehd� aloitteita, ja siksi komissio m��r�� sen, millaiseksi parlamentin poliittinen ja lains��d�nn�llinen toiminta viiden seuraavan vuoden aikana pohjimmiltaan muotoutuu. Arvoisa puhemies, muistuttaisin lis�ksi siit�, ett� parlamentti antoi edellisell� vaalikaudella kaksi kertaa luottamuslauseen puheenjohtaja Prodille; t�ll� vaalikaudella luottamuslause annettiin uudelleen hein�kuussa ja sen j�lkeen viel� kerran uuden komission astuessa virkaansa, jolloin parlamentti ��nesti syyskuussa koko komissiolle annettavasta luottamuslauseesta. Siksi on jo mennyt riitt�v�sti aikaa, jotta komissio on voinut laatia ohjelmansa ja jotta mekin voimme tutustua siihen ja selitt�� sen kansalaisille. T�ss� mieless� palautan mieliin syyskuun 15. p�iv�n p��t�slauselman, jossa suositeltiin ehdotuksen esitt�mist� mahdollisimman pikaisesti.
+Viime viikolla sattuneet asiat jotka saivat alkunsa puheenjohtajakokouksen yhteydess�, jolloin sit� k�ytettiin vain siihen, ett� sen ulkopuolella tehdyt p��t�kset voitaisiin vahvistaa ja sinet�id� ovat johtaneet pulmalliseen tilanteeseen: joko komissio ei pysty esittelem��n t�t� ohjelmaa (miss� tapauksessa komission olisi ilmoitettava se. Komission puheenjohtajan puheiden perusteella komissio pystyykin siihen. Koska komissiota edustaa varapuheenjohtaja de Palacio, luulen, ett� ennen ��nest�mist� olisi hyv� tiet�� komission tilanne eli se, pystyyk� komissio esittelem��n ohjelman niin kuin sovittiin); tai sitten parlamentti ei pysty k�sittelem��n t�t� ohjelmaa, kuten jotkut n�ytt�v�t esitt�v�n. Mielest�ni t�m� toinen olettamus merkitsisi sit�, ett� luopuisimme meille parlamenttina kuuluvista velvollisuuksistamme ja lis�ksi sit�, ett� otettaisiin k�ytt��n omaper�inen teesi ja aivan uudenlainen menetelm�, joka tarkoittaisi sit�, ett� pol
 iittisille ryhmille annetaan kirjallisessa muodossa tiedoksi komission ohjelmalliset aiheet viikkoa aikaisemmin eik� p�iv�� aikaisemmin, kuten oli sovittu n�in varsinkin, jos otetaan huomioon se, ett� lains��d�nt�ohjelmasta keskustellaan helmikuussa, ja n�in ollen voimmekin luopua koko keskustelusta, sill� seuraavana p�iv�n� lehdist� ja Internet ovat jo v�litt�neet tiedon kaikille kansalaisille, eik� parlamentilla ole en�� syyt� puuttua asiaan.
+Koska ryhm�ni on sit� mielt�, ett� parlamentti on olemassa siksi, ett� siell� voidaan pohtia asioita, kuunnella ja keskustella, k�sityksemme on, ett� t�t� viiv�stymist� ei voida perustella mitenk��n, ja jos komissio pystyy ohjelman esitt�m��n, mielest�mme ehdimme hyvin palata alkuper�iseen parlamentin ja komission v�liseen sopimukseen ja edet� vastuullisesti kansalaistemme silmiss�. Siksi Euroopan parlamentin sosiaalidemokraattisen puolueen ryhm�n ehdotus on te my�s mainitsitte sen , ett� keskiviikkona pidet��n Prodin komission koko toimikauden ohjelman esittely niin, ett� t�h�n ohjelmaan sis�ltyy my�s hallinnollinen uudistussuunnitelma, sill� saatamme joutua paradoksaaliseen tilanteeseen, jos n�in ei k�y: sen varjolla, ett� teksti� ei ole, komission puheenjohtajalta vied��n oikeus puhua parlamentille, ja toisaalta pidett�isiin sellainen uudistuksia koskeva keskustelu, jossa parlamentti ei tuntisi etuk�teen pohjana olevaa teksti�. Siksi toiv
 on, ett� te, arvoisa puhemies, pyyt�isitte komissiota ilmoittamaan komission t�m�nhetkisen kannan, mink� j�lkeen toimitettaisiin ��nestys.
+(Suosionosoituksia PSE-ryhm�lt�)
+
+Arvoisa puhemies, hyv�t kollegat, olen kaikesta huolimatta hieman h�mm�stynyt kollega Bar�n Crespon menettelyst�, sill� h�n vaatii nyt t�m�n esityslistan kohdan ottamista keskiviikon istunnon esityslistalle.
+Hyv� kollega Bar�n Crespo, teill� ei ollut mahdollisuutta olla l�sn� viime torstain puheenjohtajakokouksessa. En kritisoi sit�, sill� aina silloin t�ll�in k�y niin, ett� annamme jonkun toisen edustaa itse�mme. Kollega H�nsch edusti teit� siell�. Me keskustelimme t�st� asiasta puheenjohtajakokouksessa seikkaper�isesti. Vain teid�n ryhm�nne puolsi sit�, mit� te nyt ehdotatte. ��nestimme asiasta keskustelun j�lkeen. Jokaisella puheenjohtajallahan on yht� paljon ��ni� kuin h�nen ryhm�ss��n on j�seni�. T�st� kohdasta toimitettiin siis ��nestys. Sen tulos oli muistaakseni seuraavanlainen: 422-180 muutamien harvojen pid�tt�ytyess� ��nest�m�st�. Se tarkoittaa, ett� kaikki ryhm�t sitoutumattomia j�seni� lukuun ottamatta - mutta heh�n eiv�t olekaan mit��n ryhmi� - olivat yksimielisi� ja vain teid�n ryhm�nne kannatti sellaista menettely�, jota te olette t��ll� ehdottanut. Kaikki muut olivat eri mielt�. T�m� oli puheenjohtajakoko
 uksen p��t�s.
+Nyt haluaisin sanoa jotakin itse asiasta. Me luotamme komissioon ja Romano Prodiin, ja ryhm�mme hyvin suuri enemmist� antoi vaikean prosessin j�lkeen, kuten me kaikki tied�mme, luottamuslauseen Romano Prodille ja komissiolle. Me olemme kuitenkin my�s sit� mielt�, ett� meid�n on keskusteltava t�st� komission strategiasta asianmukaisessa menettelyss� ei pelk�st��n komission t��ll� Euroopan parlamentissa esitt�m�n suullisen tiedonannon pohjalta vaan my�s sen asiakirjan pohjalta, josta on p��tetty komissiossa ja jossa selostetaan t�t� seuraavien viiden vuoden ohjelmaa. Sellaista asiakirjaa ei ole olemassa!
+
+Komissio esitt�� vuoden 2000 ohjelman helmikuussa. Me olemme sanoneet, hyv� on, jos komissio ei halua tehd� vuoden 2000 ohjelmaa viel� tammikuussa, me teemme sen sitten helmikuussa. Olemme hyv�ksyneet sen. Meh�n emme halua sin�ns� mit��n riitaa komission kanssa, vaan olemme sit� mielt�, ett� komission ja parlamentin on kuljettava yhteist� tiet�, jos se nimitt�in on mahdollista. Me parlamenttina toimimme kuitenkin my�s komission valvojana. Eik� meid�n tarvitse olla samaa mielt� kaikesta, mit� komissio esitt��.
+Haluaisin, ett� voimme valmistautua viisivuotisohjelmasta k�yt�v��n keskusteluun poliittisissa ryhmiss� j�rkev�ll� tavalla. Se ei ole mahdollista, jos t��ll� kuullaan julkilausuma tiet�m�tt� lainkaan, mit� se sis�lt��. Suosittelemme siksi - ja minun vaikutelmani on, ett� komissio on my�s vastaanottavainen t�lle ajatukselle -, ett� keskustelemme helmikuussa komission pitk�aikaisesta vuoteen 2005 ulottuvasta ohjelmasta - toivon komission p��sev�n siihen menness� yhteisymm�rrykseen my�s ohjelmasta, jonka se esitt�� meille - ja ett� k�ymme samaan aikaan helmikuussa my�s keskustelua komission vuoden 2000 lains��d�nt�ohjelmasta. T�m� on siis my�s asiayhteys, jonka takia on j�rkev�� keskustella molemmista ohjelmista yhdess�. Ryhm�ni hylk�� siksi ehdottomasti sosialistiryhm�n ehdotuksen!
+(Suosionosoituksia PPE-DE-ryhm�lt�)
+
+. (ES) Arvoisa puhemies, haluan tehd� hyvin selv�ksi, ett� komissio pit�� ilman muuta parlamentin p��t�ksi� mahdollisimman suuressa arvossa, ja sen my�t� arvostaa my�s esityslistaa. Siksi kunnioitamme sit�, mik� on t�ss� mieless� parlamentin p��tett�viss�.
+Haluaisin my�s tehd� selv�ksi, ett� puheenjohtaja Prodi lupasi parlamentille, ett� t�m� uusi keskustelu otetaan esityslistalle, mist� j�sen Bar�n Crespo muistutti, ja ett� t�m� uusi keskustelu k�yd��n komission lains��d�nt�ohjelmasta k�yt�v�n vuosikeskustelun lis�ksi ja siin� k�sitell��n seuraavan viisivuotiskauden, eli t�m�n toimikauden, suuria toimintalinjoja.
+Arvoisa puhemies, haluaisin sanoa, ett� t�m� keskustelu poikkeaa syyskuussa tehdyll� sopimuksella komission lains��d�nt�ohjelman vuosittaisesta esittelyst�. Ja haluaisin sanoa, arvoisa puhemies, ett� komission puolesta olemme valmistautuneet ja valmiita k�ym��n t�m�n keskustelun, kun se sopii muille, ja ett� olemme valmiita k�ym��n sen t�ll� viikolla, kuten alun perin oli sovittu, ja voimme l�hte� my�s siit�, ett� se edelt�v�n� p�iv�n� esitell��n parlamentin ryhmille.
+Arvoisa puhemies, niinp� haluan toistaa, ett� omasta puolestamme olemme keskustelleet seuraavan viisivuotiskauden toimintasuunnitelmasta ja ett� olemme valmiit, kun parlamentti niin p��tt�� t�ll� samalla viikolla, jos se on parlamentin p��t�s esittelem��n parlamentille t�m�n viisivuotisohjelman ja ensi kuussa ohjelman vuodelle 2000, kuten oli nimenomaan sovittu.
+
+Ehdotan, ett� pid�mme ��nestyksen sosialistiryhm�n pyynn�st�, jolla pyrit��n merkitsem��n esityslistalle uudelleen komission julkilausuma komission strategisista tavoitteista.
+(Parlamentti hylk�si pyynn�n.) Puhemies. Edelleen keskiviikon istunnon osalta minulla on toinenkin ehdotus, joka liittyy p��omaveroa koskevaan suulliseen kysymykseen. PPE-DE-ryhm� pyyt�� t�m�n kohdan poistamista esityslistalta.
+Haluaako joku kollegoista k�ytt�� puheenvuoron ryhm�n puolesta ja perustella t�m�n pyynn�n?
+
+Arvoisa puhemies, koska kuulen sosialistiryhm�n keskuudesta hieman naurua - minulle sanottiin, ett� my�s monet sosialistiryhm�n j�senist� haluavat mielell��n poistaa t�m�n kohdan esityslistalta, koska puheenjohtajakokouksen ��nestyksess� ei ollut k�ytett�viss� sosialistiryhm�n asiasta vastaavien j�senten ty�ryhm�n ��nestystulosta. En tied�, pit��k� t�m� tieto paikkansa, mutta me PPE-DE-ryhm�n� olisimme joka tapauksessa kiitollisia, jos t�m� kohta poistettaisiin esityslistalta, koska parlamentti on k�sitellyt t�t� asiaa jo useita kertoja. T�llaista veroa vastaan on tehty my�s p��t�ksi�. Ryhm�ni pyyt�� siksi t�m�n kohdan poistamista esityslistalta.
+
+Kiitos, jäsen Poettering.
+Nyt kuuntelemme jäsen Wurtzia, joka puhuu tätä pyyntöä vastaan.
+
+Arvoisa puhemies, haluaisin ensiksi painottaa j�sen Poetteringin ep�loogisuutta. �sken h�n l�ksytti sosialistiryhm��, koska ryhm� olisi perunut puheenjohtajakokouksessa tehdyn ��rimm�isen selke�n p��t�ksen. Nyt h�n toimii itse samoin. Keskustelimme ja olimme asiasta yksimielisi� PPE-ryhm�� ja liberaaliryhm�� lukuun ottamatta, ja silloin jopa huomautin te, hyv�t puheenjohtajakollegani varmaan muistatte sen , ett� kyse ei ole siit�, oletteko te ns. Tobin-veroa vastaan vai ette, vaan siit�, rohkenetteko ymm�rt��, mit� mielt� komissio ja neuvosto siit� ovat. T�m� ei ole liikaa vaadittu. N�in ollen pid�n kiinni ehdotuksesta, jonka mukaan s�ilyt�mme t�m�n komissiolle ja neuvostolle esitett�v�n suullisen kysymyksen, jotta saamme vihdoin tiet�� n�iden kahden tahon kannan t�st� suhteellisen vaatimattomasta ehdotuksesta, joka kuitenkin merkitsisi t�rke�� signaalia yleisen mielipiteen suuntaan varsinkin niiden tunteiden j�lkeen, joita Seattlen 
 huippukokouksen ep�onnistuminen her�tti.
+
+ ��nest�mme PPE-DE-ryhm�n pyynn�st�, jolla pyrit��n s�ilytt�m��n esityslistalla p��omaveroa koskeva suullinen kysymys.
+(Parlamentti hylk�si pyynn�n: 164 puolesta, 166 vastaan ja 7 tyhj��.)
+
+Arvoisa puhemies, haluaisin kiitt�� j�sen Poetteringia t�st� mainosiskusta, jonka h�n juuri j�rjesti kyseiselle keskustelulle. Kiitos.
+
+Arvoisa puhemies, onko ��neni otettu huomioon, vaikka se ei s�hk�isesti mennytk��n l�pi, sill� minulla ei ole ��nestyskorttia? ��nestin puolesta.
+
+Jos lis��mmekin molempien ilmoittautuneiden kollegojen puheet, saamme oikeastaan tulokseksi...
+
+Arvoisa puhemies, ��nestystuloshan on julistettu. Muutoksia ei voi tehd�.
+
+ Hyv�t kollegat, toistan viel� kerran, ett� jokaisella on oltava ��nestyskorttinsa maanantaisin. Meill� on siin� asiassa n�k�j��n ongelmia. N�in ollen minun on teht�v� p��t�s.
+My�s min� unohdin ��nestyskorttini ja olisin ��nest�nyt vastaan. Katson siis, ett� suullinen kysymys s�ilytet��n esityslistalla.
+T�m� on viimeinen kerta, kun otamme huomioon ��nestyskorttinsa unohtaneiden j�senten ��net. Tulkoon t�m� varsin selv�ksi ja teht�k��n se tiett�v�ksi.
+(Suosionosoituksia)
+Kyll�, suullinen kysymys s�ilyy esityslistalla, ja kyll�, puhemiehell�kin on oikeus ��nest��, kuten h�nell� on my�s oikeus unohtaa ��nestyskorttinsa.
+Jatkamme muiden esityslistalle ehdotettujen muutosten k�sittely�.
+
+Arvoisa puhemies, aiemmassa ��nestyksess� ja aion noudattaa sit�, mit� asiasta sanotte , joka koski komission strategista suunnitelmaa, ilmoitin haluavani puhua ennen ��nestyst� ryhm�ni puolesta. N�in ei k�ynyt. Olisin kiitollinen, jos saisin mahdollisuuden antaa ��nestysselityksen ryhm�ni puolesta sen j�lkeen, kun t�m� asia on saatu k�sitelty�. T�m� on t�rke� asia. Olisi parlamentille eduksi, ett� j�senet voisivat ilmaista omien poliittisten analyysiensa valossa, miten he suhtautuvat siihen, mit� olemme juuri saaneet aikaiseksi.
+
+Arvoisa puhemies, en halua avata keskustelua uudelleen, mutta my�s min� pyysin puheenvuoroa ottaakseni kantaa Bar�n Crespon pyynt��n. Te ette pyyt�nyt my�sk��n minua puhumaan. Pahoittelen sit�, mutta ��nestys on toimitettu ja p��t�s tehty, joten antakaamme asian olla.
+
+ Olen pahoillani, j�sen H�nsch ja j�sen Cox, en n�hnyt, ett� pyysitte puheenvuoroa. N�in ollen uskon, ett� kannat ovat varsin hyvin selvill� ja ett� ne kirjataan p�yt�kirjaan. Kun huomenna hyv�ksymme t�m�np�iv�isen istunnon p�yt�kirjan, ne kollegat, joiden mielest� jotakin kantaa ei ole selitetty riitt�v�n hyvin, voivat pyyt�� muutoksia. Minusta t�m� on hyv� tapa. Tietenkin huomisen istunnon p�yt�kirjassa otetaan kaikki t�ydent�v�t selitykset huomioon. Uskoakseni t�m� on parempi tapa kuin se, ett� ryhtyisimme aikaa vieviin ��nestysselityksiin nyt. J�sen Cox ja j�sen H�nsch, sopiiko t�m� teille?
+
+Arvoisa puhemies, mik�li p�yt�kirjaan on kirjattu oikein, miten ryhm�ni ��nesti, en aio enk� voikaan vastustaa sit�. Jos p��t�ksenne on, ett� en voi antaa ��nestysselityst�, hyv�ksyn sen, mutta tietyin varauksin.
+
+ Aiomme siis laatia p�yt�kirjan hyvin tarkasti. Niinh�n me oikeastaan teemme aina. Jos kannat eiv�t tule p�yt�kirjasta kunnolla ilmi, voimme tarvittaessa korjata asian.
+(K�sittelyj�rjestys vahvistettiin n�in muutettuna.)
+


[24/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
new file mode 100644
index 0000000..08a42f3
--- /dev/null
+++ b/nutch-plugins/index-anchor/src/test/java/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.anchor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
+ * deduplication functionality is working
+ *
+ * @author lewismc
+ *
+ */
+public class TestAnchorIndexingFilter {
+
+  @Test
+  public void testDeduplicateAnchor() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    // Turn on the filter's deduplication mode so that repeated anchor
+    // texts (see the duplicated "text2" below) collapse to one value.
+    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
+    AnchorIndexingFilter filter = new AnchorIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+    // Three inlinks, two of which carry the identical anchor text "text2".
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://test1.com/", "text1"));
+    inlinks.add(new Inlink("http://test2.com/", "text2"));
+    inlinks.add(new Inlink("http://test3.com/", "text2"));
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          new CrawlDatum(), inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+    Assert.assertNotNull(doc);
+    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
+        .contains("anchor"));
+    // With deduplication enabled the three inlinks above must yield only
+    // two distinct "anchor" field values ("text1" and "text2").
+    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
+        .getValues().size());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/build.xml b/nutch-plugins/index-basic/build.xml
new file mode 100755
index 0000000..a834290
--- /dev/null
+++ b/nutch-plugins/index-basic/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-basic" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/ivy.xml b/nutch-plugins/index-basic/ivy.xml
new file mode 100644
index 0000000..848216e
--- /dev/null
+++ b/nutch-plugins/index-basic/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+      <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/plugin.xml b/nutch-plugins/index-basic/plugin.xml
new file mode 100755
index 0000000..c5d784d
--- /dev/null
+++ b/nutch-plugins/index-basic/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-basic"
+   name="Basic Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="index-basic.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.basic"
+              name="Nutch Basic Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="BasicIndexingFilter"
+                      class="org.apache.nutch.indexer.basic.BasicIndexingFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/pom.xml b/nutch-plugins/index-basic/pom.xml
new file mode 100644
index 0000000..3dc3d91
--- /dev/null
+++ b/nutch-plugins/index-basic/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-basic</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-basic</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
new file mode 100644
index 0000000..8584fa8
--- /dev/null
+++ b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.basic;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.URLUtil;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Date;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Adds basic searchable fields to a document. The fields added are : domain,
+ * host, url, content, title, cache, tstamp. The domain field is included depending on
+ * {@code indexer.add.domain} in nutch-default.xml. title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml. (As per NUTCH-1004, a
+ * zero-length title is not added) content is truncated as per
+ * {@code indexer.max.content.length} in nutch-default.xml.
+ */
+public class BasicIndexingFilter implements IndexingFilter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(BasicIndexingFilter.class);
+
+  private int MAX_TITLE_LENGTH;
+  private int MAX_CONTENT_LENGTH;
+  private boolean addDomain = false;
+  private Configuration conf;
+
+  /**
+   * The {@link BasicIndexingFilter} filter object which supports a few
+   * configuration settings for adding basic searchable fields. See
+   * {@code indexer.add.domain}, {@code indexer.max.title.length},
+   * {@code indexer.max.content.length} in nutch-default.xml.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} containing anchor text
+   * @return filtered NutchDocument
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
+    String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
+    String urlString = url.toString();
+
+    String host = null;
+    try {
+      URL u;
+      if (reprUrlString != null) {
+        u = new URL(reprUrlString);
+      } else {
+        u = new URL(urlString);
+      }
+
+      if (addDomain) {
+        doc.add("domain", URLUtil.getDomainName(u));
+      }
+
+      host = u.getHost();
+    } catch (MalformedURLException e) {
+      throw new IndexingException(e);
+    }
+
+    if (host != null) {
+      doc.add("host", host);
+    }
+
+    doc.add("url", reprUrlString == null ? urlString : reprUrlString);
+
+    // content
+    String content = parse.getText();
+    if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
+      content = content.substring(0, MAX_CONTENT_LENGTH);
+    }
+    doc.add("content", StringUtil.cleanField(content));
+
+    // title
+    String title = parse.getData().getTitle();
+    if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate
+                                                                      // title
+                                                                      // if
+                                                                      // needed
+      title = title.substring(0, MAX_TITLE_LENGTH);
+    }
+
+    if (title.length() > 0) {
+      // NUTCH-1004 Do not index empty values for title field
+      doc.add("title", StringUtil.cleanField(title));
+    }
+
+    // add cached content/summary display policy, if available
+    String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
+    if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
+      doc.add("cache", caching);
+    }
+
+    // add timestamp when fetched, for deduplication
+    doc.add("tstamp", new Date(datum.getFetchTime()));
+
+    return doc;
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+    this.addDomain = conf.getBoolean("indexer.add.domain", false);
+    this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html
new file mode 100644
index 0000000..3fae405
--- /dev/null
+++ b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A basic indexing plugin, adds basic fields: url, host, title, content, etc.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
new file mode 100644
index 0000000..4bc317e
--- /dev/null
+++ b/nutch-plugins/index-basic/src/test/java/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.basic.BasicIndexingFilter;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Date;
+
+/**
+ * JUnit test case which tests 1. that basic searchable fields are added to a
+ * document 2. that domain is added as per {@code indexer.add.domain} in
+ * nutch-default.xml. 3. that title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
+ * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+ * 
+ * @author tejasp
+ * 
+ */
+
+public class TestBasicIndexingFilter {
+
+  @Test
+  public void testBasicIndexingFilter() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setInt("indexer.max.title.length", 10);
+    conf.setBoolean("indexer.add.domain", true);
+    conf.setInt("indexer.max.content.length", 20);
+
+    BasicIndexingFilter filter = new BasicIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+
+    NutchDocument doc = new NutchDocument();
+
+    String title = "The Foo Page";
+    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
+    Metadata metaData = new Metadata();
+    metaData.add("Language", "en/us");
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, metaData);
+    ParseImpl parse = new ParseImpl(
+        "this is a sample foo bar page. hope you enjoy it.", parseData);
+
+    CrawlDatum crawlDatum = new CrawlDatum();
+    crawlDatum.setFetchTime(100L);
+
+    Inlinks inlinks = new Inlinks();
+
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+    Assert.assertNotNull(doc);
+    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
+        .getField("title").getValues().get(0));
+    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
+        .getField("domain").getValues().get(0));
+    Assert.assertEquals("test host, expect \"nutch.apache.org\"",
+        "nutch.apache.org", doc.getField("host").getValues().get(0));
+    Assert.assertEquals(
+        "test url, expect \"http://nutch.apache.org/index.html\"",
+        "http://nutch.apache.org/index.html", doc.getField("url").getValues()
+            .get(0));
+    Assert.assertEquals("test content", "this is a sample foo",
+        doc.getField("content").getValues().get(0));
+    Assert.assertEquals("test fetch time", new Date(100L),
+        (Date) doc.getField("tstamp").getValues().get(0));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/build-ivy.xml b/nutch-plugins/index-geoip/build-ivy.xml
new file mode 100644
index 0000000..2cda7e9
--- /dev/null
+++ b/nutch-plugins/index-geoip/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-geoip" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/build.xml b/nutch-plugins/index-geoip/build.xml
new file mode 100644
index 0000000..92fda82
--- /dev/null
+++ b/nutch-plugins/index-geoip/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-geoip" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+  <target name="init-plugin">
+    <echo>Copying MaxMind GeoIP .mmdb files to build</echo>
+    <copy todir="${build.classes}">
+      <fileset dir="${src.dir}" includes="**/*.mmdb" />
+    </copy>
+  </target>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/ivy.xml b/nutch-plugins/index-geoip/ivy.xml
new file mode 100644
index 0000000..1b626f0
--- /dev/null
+++ b/nutch-plugins/index-geoip/ivy.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" >
+      <!-- Exclude due to classpath issues -->
+      <exclude org="org.apache.httpcomponents" name="httpclient" />
+      <exclude org="org.apache.httpcomponents" name="httpcore" />
+    </dependency>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/plugin.xml b/nutch-plugins/index-geoip/plugin.xml
new file mode 100644
index 0000000..214fbd0
--- /dev/null
+++ b/nutch-plugins/index-geoip/plugin.xml
@@ -0,0 +1,51 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-geoip"
+   name="GeoIP2 Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="index-geoip.jar">
+         <export name="*"/>
+      </library>
+      <library name="commons-codec-1.6.jar"/>
+      <library name="commons-logging-1.1.1.jar"/>
+      <library name="geoip2-2.3.1.jar"/>
+      <library name="google-http-client-1.20.0.jar"/>
+      <library name="jackson-annotations-2.5.0.jar"/>
+      <library name="jackson-core-2.5.3.jar"/>
+      <library name="jackson-databind-2.5.3.jar"/>
+      <library name="jsr305-1.3.9.jar"/>
+      <library name="maxmind-db-1.0.0.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.geoip"
+              name="Nutch GeoIP2 Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="GeoIPIndexingFilter"
+                      class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/pom.xml b/nutch-plugins/index-geoip/pom.xml
new file mode 100644
index 0000000..1238982
--- /dev/null
+++ b/nutch-plugins/index-geoip/pom.xml
@@ -0,0 +1,55 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-geoip</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-geoip</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>com.maxmind.geoip2</groupId>
+            <artifactId>geoip2</artifactId>
+            <version>2.3.1</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.httpcomponents</groupId>
+                    <artifactId>httpclient</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.httpcomponents</groupId>
+                    <artifactId>httpcore</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
new file mode 100644
index 0000000..88d78ef
--- /dev/null
+++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -0,0 +1,210 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.geoip;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.nutch.indexer.NutchDocument;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.WebServiceClient;
+import com.maxmind.geoip2.exception.GeoIp2Exception;
+import com.maxmind.geoip2.model.InsightsResponse;
+import com.maxmind.geoip2.model.CityResponse;
+import com.maxmind.geoip2.model.ConnectionTypeResponse;
+import com.maxmind.geoip2.model.CountryResponse;
+import com.maxmind.geoip2.model.DomainResponse;
+import com.maxmind.geoip2.model.IspResponse;
+import com.maxmind.geoip2.record.City;
+import com.maxmind.geoip2.record.Continent;
+import com.maxmind.geoip2.record.Country;
+import com.maxmind.geoip2.record.Location;
+import com.maxmind.geoip2.record.Postal;
+import com.maxmind.geoip2.record.RepresentedCountry;
+import com.maxmind.geoip2.record.Subdivision;
+import com.maxmind.geoip2.record.Traits;
+
+/**
+ * <p>
+ * Simple utility class which enables efficient, structured
+ * {@link org.apache.nutch.indexer.NutchDocument} building based on input from
+ * {@link GeoIPIndexingFilter}, where configuration is also read.
+ * </p>
+ * <p>
+ * Based on the nature of the input, this class wraps factory type
+ * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}
+ * 's with the correct {@link org.apache.nutch.indexer.NutchField} information.
+ * 
+ */
+public class GeoIPDocumentCreator {
+
+  /**
+   * Default constructor.
+   */
+  public GeoIPDocumentCreator() {
+  }
+
+  public static NutchDocument createDocFromInsightsService(String serverIp,
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    doc.add("ip", serverIp);
+    InsightsResponse response = client
+        .insights(InetAddress.getByName(serverIp));
+    // CityResponse response = client.city(InetAddress.getByName(serverIp));
+
+    City city = response.getCity();
+    doc.add("cityName", city.getName()); // 'Minneapolis'
+    doc.add("cityConfidence", city.getConfidence()); // 50
+    doc.add("cityGeoNameId", city.getGeoNameId());
+
+    Continent continent = response.getContinent();
+    doc.add("continentCode", continent.getCode());
+    doc.add("continentGeoNameId", continent.getGeoNameId());
+    doc.add("continentName", continent.getName());
+
+    Country country = response.getCountry();
+    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+    doc.add("countryName", country.getName()); // 'United States'
+    doc.add("countryConfidence", country.getConfidence()); // 99
+    doc.add("countryGeoName", country.getGeoNameId());
+
+    Location location = response.getLocation();
+    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+                                                                               // -93.2323
+    doc.add("accRadius", location.getAccuracyRadius()); // 3
+    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
+    doc.add("metroCode", location.getMetroCode());
+
+    Postal postal = response.getPostal();
+    doc.add("postalCode", postal.getCode()); // '55455'
+    doc.add("postalConfidence", postal.getConfidence()); // 40
+
+    RepresentedCountry rCountry = response.getRepresentedCountry();
+    doc.add("countryType", rCountry.getType());
+
+    Subdivision subdivision = response.getMostSpecificSubdivision();
+    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
+    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+
+    Traits traits = response.getTraits();
+    doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
+    doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
+    doc.add("domain", traits.getDomain());
+    doc.add("isp", traits.getIsp());
+    doc.add("org", traits.getOrganization());
+    doc.add("userType", traits.getUserType());
+    doc.add("isAnonProxy", traits.isAnonymousProxy());
+    doc.add("isSatelliteProv", traits.isSatelliteProvider());
+    return doc;
+  }
+
+  @SuppressWarnings("unused")
+  public static NutchDocument createDocFromCityService(String serverIp,
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    CityResponse response = client.city(InetAddress.getByName(serverIp));
+    return doc;
+  }
+
+  @SuppressWarnings("unused")
+  public static NutchDocument createDocFromCountryService(String serverIp,
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    CountryResponse response = client.country(InetAddress.getByName(serverIp));
+    return doc;
+  }
+
+  public static NutchDocument createDocFromIspDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    IspResponse response = reader.isp(InetAddress.getByName(serverIp));
+    doc.add("ip", serverIp);
+    doc.add("autonSystemNum", response.getAutonomousSystemNumber());
+    doc.add("autonSystemOrg", response.getAutonomousSystemOrganization());
+    doc.add("isp", response.getIsp());
+    doc.add("org", response.getOrganization());
+    return doc;
+  }
+
+  public static NutchDocument createDocFromDomainDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
+    doc.add("ip", serverIp);
+    doc.add("domain", response.getDomain());
+    return doc;
+  }
+
+  public static NutchDocument createDocFromConnectionDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    ConnectionTypeResponse response = reader.connectionType(InetAddress
+        .getByName(serverIp));
+    doc.add("ip", serverIp);
+    doc.add("connType", response.getConnectionType().toString());
+    return doc;
+  }
+
+  public static NutchDocument createDocFromCityDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    doc.add("ip", serverIp);
+    CityResponse response = reader.city(InetAddress.getByName(serverIp));
+
+    City city = response.getCity();
+    doc.add("cityName", city.getName()); // 'Minneapolis'
+    doc.add("cityConfidence", city.getConfidence()); // 50
+    doc.add("cityGeoNameId", city.getGeoNameId());
+
+    Continent continent = response.getContinent();
+    doc.add("continentCode", continent.getCode());
+    doc.add("continentGeoNameId", continent.getGeoNameId());
+    doc.add("continentName", continent.getName());
+
+    Country country = response.getCountry();
+    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+    doc.add("countryName", country.getName()); // 'United States'
+    doc.add("countryConfidence", country.getConfidence()); // 99
+    doc.add("countryGeoName", country.getGeoNameId());
+
+    Location location = response.getLocation();
+    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+                                                                               // -93.2323
+    doc.add("accRadius", location.getAccuracyRadius()); // 3
+    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
+    doc.add("metroCode", location.getMetroCode());
+
+    Postal postal = response.getPostal();
+    doc.add("postalCode", postal.getCode()); // '55455'
+    doc.add("postalConfidence", postal.getConfidence()); // 40
+
+    RepresentedCountry rCountry = response.getRepresentedCountry();
+    doc.add("countryType", rCountry.getType());
+
+    Subdivision subdivision = response.getMostSpecificSubdivision();
+    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
+    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+    return doc;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
new file mode 100644
index 0000000..f515f1f
--- /dev/null
+++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.geoip;
+
+import java.io.File;
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.WebServiceClient;
+
+/**
+ * <p>
+ * This plugin implements an indexing filter which takes advantage of the <a
+ * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.
+ * </p>
+ * <p>
+ * The third party library distribution provides an API for the GeoIP2 <a
+ * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web
+ * services</a> and <a
+ * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The
+ * API also works with the free <a
+ * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
+ * </p>
+ * <p>
+ * Depending on the service level agreement, you have with the GeoIP service
+ * provider, the plugin can add a number of the following fields to the index
+ * data model:
+ * <ol>
+ * <li>Continent</li>
+ * <li>Country</li>
+ * <li>Regional Subdivision</li>
+ * <li>City</li>
+ * <li>Postal Code</li>
+ * <li>Latitude/Longitude</li>
+ * <li>ISP/Organization</li>
+ * <li>AS Number</li>
+ * <li>Confidence Factors</li>
+ * <li>Radius</li>
+ * <li>User Type</li>
+ * </ol>
+ * </p>
+ * 
+ * <p>
+ * Some of the services are documented at the <a
+ * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision
+ * Services</a> webpage where more information can be obtained.
+ * </p>
+ * 
+ * <p>
+ * You should also consult the following three properties in
+ * <code>nutch-site.xml</code>
+ * </p>
+ * 
+ * <pre>
+ *  {@code
+ * <!-- index-geoip plugin properties -->
+ * <property>
+ *   <name>index.geoip.usage</name>
+ *   <value>insightsService</value>
+ *   <description>
+ *   A string representing the information source to be used for GeoIP information
+ *   association. Either enter 'cityDatabase', 'connectionTypeDatabase', 
+ *   'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the 
+ *   Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, 
+ *   GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath 
+ *   and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
+ *   </description>
+ * </property>
+ * 
+ * <property>
+ *   <name>index.geoip.userid</name>
+ *   <value></value>
+ *   <description>
+ *   The userId associated with the GeoIP2 Precision Services account.
+ *   </description>
+ * </property>
+ * 
+ * <property>
+ *   <name>index.geoip.licensekey</name>
+ *   <value></value>
+ *   <description>
+ *   The license key associated with the GeoIP2 Precision Services account.
+ *   </description>
+ * </property>
+ * }
+ * </pre>
+ * 
+ */
+public class GeoIPIndexingFilter implements IndexingFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(GeoIPIndexingFilter.class);
+
+  // Job configuration handed in via setConf(); read again in addServerGeo().
+  private Configuration conf;
+
+  // Value of the "index.geoip.usage" property; selects which lookup backend
+  // (a local MaxMind database or the Insights web service) filter() uses.
+  private String usage = null;
+
+  // Local MaxMind .mmdb database file; only assigned for the *Database modes.
+  private File geoDb = null;
+
+  // Web-service client; only built for the "insightsService" usage mode.
+  WebServiceClient client = null;
+
+  // Reader over the local database; only initialized by buildDb().
+  DatabaseReader reader = null;
+
+  // private AbstractResponse response = null;
+
+  /**
+   * Default constructor for this plugin
+   */
+  public GeoIPIndexingFilter() {
+  }
+
+  /**
+   * @see org.apache.hadoop.conf.Configurable#getConf()
+   */
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Reads "index.geoip.usage" (default "insightsService") and eagerly opens
+   * the matching MaxMind database from the classpath, or builds a web-service
+   * client from "index.geoip.userid" / "index.geoip.licensekey". Database
+   * open failures are logged, not thrown, so filter() may later run with a
+   * null {@link #reader}.
+   * NOTE(review): conf.getResource(...) returns null when the .mmdb file is
+   * not on the classpath; the resulting NullPointerException is swallowed by
+   * the catch(Exception) blocks below -- presumably intentional, confirm.
+   *
+   * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String use = conf.get("index.geoip.usage", "insightsService");
+    LOG.debug("GeoIP usage medium set to: {}", use);
+    if (use.equalsIgnoreCase("cityDatabase")) {
+      try {
+        geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile());
+        buildDb();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
+      try {
+        geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb")
+            .getFile());
+        buildDb();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    } else if (use.equalsIgnoreCase("domainDatabase")) {
+      try {
+        geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile());
+        buildDb();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    } else if (use.equalsIgnoreCase("ispDatabase")) {
+      try {
+        geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile());
+        buildDb();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    } else if (use.equalsIgnoreCase("insightsService")) {
+      // 12345 is only a placeholder default; a real account id is expected
+      // to be configured via "index.geoip.userid".
+      client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid",
+          12345), conf.get("index.geoip.licensekey")).build();
+    }
+    usage = use;
+  }
+
+  /** Opens {@link #geoDb} with a DatabaseReader; IO errors are logged only. */
+  private void buildDb() {
+    try {
+      reader = new DatabaseReader.Builder(geoDb).build();
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  /**
+   * Delegates to {@link #addServerGeo(NutchDocument, ParseData, String)}
+   * with the parse data and the URL rendered as a String.
+   *
+   * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument,
+   *      org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text,
+   *      org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
+   */
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    return addServerGeo(doc, parse.getData(), url.toString());
+  }
+
+  /**
+   * Adds GeoIP fields for the server IP stored in the content metadata under
+   * the "_ip_" key. Runs only when "store.ip.address" is true (default
+   * false); otherwise the document is returned unchanged. Field population
+   * is delegated to GeoIPDocumentCreator according to {@link #usage}.
+   * NOTE(review): "== true" is redundant, and the catch block logs only
+   * e.getMessage() plus printStackTrace(); logging the Throwable itself
+   * would keep the stack trace in the log instead of stderr.
+   */
+  private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
+      String url) {
+
+    if (conf.getBoolean("store.ip.address", false) == true) {
+      try {
+        String serverIp = data.getContentMeta().get("_ip_");
+        if (serverIp != null) {
+          if (usage.equalsIgnoreCase("cityDatabase")) {
+            doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
+                reader);
+          } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
+            doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
+                reader);
+          } else if (usage.equalsIgnoreCase("domainDatabase")) {
+            doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
+                reader);
+          } else if (usage.equalsIgnoreCase("ispDatabase")) {
+            doc = GeoIPDocumentCreator
+                .createDocFromIspDb(serverIp, doc, reader);
+          } else if (usage.equalsIgnoreCase("insightsService")) {
+            doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
+                doc, client);
+          }
+        }
+      } catch (Exception e) {
+        LOG.error(e.getMessage());
+        e.printStackTrace();
+      }
+    }
+    return doc;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java
new file mode 100644
index 0000000..ba62519
--- /dev/null
+++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>This plugin implements an indexing filter which takes 
+ * advantage of the 
+ * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
+ * <p>The third party library distribution provides an API for the GeoIP2 
+ * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a> 
+ * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. 
+ * The API also works with the free 
+ * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
+ *
+ */
+package org.apache.nutch.indexer.geoip;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/build.xml b/nutch-plugins/index-links/build.xml
new file mode 100644
index 0000000..b853ccf
--- /dev/null
+++ b/nutch-plugins/index-links/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-links" default="jar-core">
+
+    <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/ivy.xml b/nutch-plugins/index-links/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/index-links/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/plugin.xml b/nutch-plugins/index-links/plugin.xml
new file mode 100644
index 0000000..dfdc5d2
--- /dev/null
+++ b/nutch-plugins/index-links/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+    id="index-links"
+    name="Index inlinks and outlinks"
+    version="1.0.0"
+    provider-name="nutch.org">
+
+    <runtime>
+        <library name="index-links.jar">
+            <export name="*"/>
+        </library>
+    </runtime>
+
+    <requires>
+        <import plugin="nutch-extensionpoints"/>
+    </requires>
+
+    <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter"
+               name="Links indexing filter"
+               point="org.apache.nutch.indexer.IndexingFilter">
+        <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter"
+                        class="org.apache.nutch.indexer.links.LinksIndexingFilter"/>
+    </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/pom.xml b/nutch-plugins/index-links/pom.xml
new file mode 100644
index 0000000..e5e3a7f
--- /dev/null
+++ b/nutch-plugins/index-links/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-links</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-links</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
new file mode 100644
index 0000000..975df66
--- /dev/null
+++ b/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.slf4j.LoggerFactory;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
+ * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
+ *
+ * In case that you want to ignore the outlinks that point to the same host
+ * as the URL being indexed use the following settings in your configuration
+ * file:
+ *
+ * <property>
+ *   <name>index.links.outlinks.host.ignore</name>
+ *   <value>true</value>
+ * </property>
+ *
+ * The same configuration is available for inlinks:
+ *
+ * <property>
+ *   <name>index.links.inlinks.host.ignore</name>
+ *   <value>true</value>
+ * </property>
+ *
+ * To store only the host portion of each inlink URL or outlink URL add the
+ * following to your configuration file.
+ *
+ * <property>
+ *   <name>index.links.hosts.only</name>
+ *   <value>false</value>
+ * </property>
+ *
+ */
+public class LinksIndexingFilter implements IndexingFilter {
+
+  // Config key: when true, drop outlinks pointing to the document's own host.
+  public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore";
+  // Config key: when true, drop inlinks coming from the document's own host.
+  public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore";
+  // Config key: when true, index only the host portion of each link URL.
+  public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only";
+
+  public final static org.slf4j.Logger LOG = LoggerFactory
+      .getLogger(LinksIndexingFilter.class);
+
+  private Configuration conf;
+  private boolean filterOutlinks;
+  private boolean filterInlinks;
+  private boolean indexHost;
+
+  /**
+   * Adds "outlinks" and "inlinks" fields to the document. In host-only mode
+   * ({@link #LINKS_ONLY_HOSTS}) each link is reduced to its lowercased host
+   * and duplicate hosts are added only once (tracked via the local sets).
+   * Malformed link URLs are logged and skipped, never propagated.
+   */
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // Add the outlinks
+    Outlink[] outlinks = parse.getData().getOutlinks();
+
+    if (outlinks != null) {
+      // Hosts already emitted; only consulted in host-only mode.
+      Set<String> hosts = new HashSet<String>();
+
+      for (Outlink outlink : outlinks) {
+        try {
+          String linkUrl = outlink.getToUrl();
+          String outHost = new URL(linkUrl).getHost().toLowerCase();
+
+          if (indexHost) {
+            linkUrl = outHost;
+
+            if (hosts.contains(linkUrl))
+              continue;
+
+            hosts.add(linkUrl);
+          }
+
+          addFilteredLink("outlinks", url.toString(), linkUrl, outHost,
+              filterOutlinks, doc);
+        } catch (MalformedURLException e) {
+          LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+        }
+      }
+    }
+
+    // Add the inlinks
+    if (null != inlinks) {
+      Iterator<Inlink> iterator = inlinks.iterator();
+      // Hosts already emitted; only consulted in host-only mode.
+      Set<String> inlinkHosts = new HashSet<String>();
+
+      while (iterator.hasNext()) {
+        try {
+          Inlink link = iterator.next();
+          String linkUrl = link.getFromUrl();
+          String inHost = new URL(linkUrl).getHost().toLowerCase();
+
+          if (indexHost) {
+            linkUrl = inHost;
+
+            if (inlinkHosts.contains(linkUrl))
+              continue;
+
+            inlinkHosts.add(linkUrl);
+          }
+
+          addFilteredLink("inlinks", url.toString(), linkUrl, inHost,
+              filterInlinks, doc);
+        } catch (MalformedURLException e) {
+          LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Adds linkUrl to the given field unless filtering is on and the link's
+   * host (urlHost) equals the host of the document URL being indexed.
+   * NOTE(review): url is already a String, so url.toString() below is a
+   * redundant no-op call.
+   */
+  private void addFilteredLink(String fieldName, String url, String linkUrl,
+      String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException {
+      if (filter) {
+        String host = new URL(url.toString()).getHost().toLowerCase();
+
+        if (!host.equalsIgnoreCase(urlHost)) {
+          doc.add(fieldName, linkUrl);
+        }
+      } else {
+        doc.add(fieldName, linkUrl);
+      }
+  }
+
+  /** Caches the three boolean switches from the configuration (all default false). */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
+    filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false);
+
+    indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
new file mode 100644
index 0000000..c490d1f
--- /dev/null
+++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.net.URL;
+import java.util.Iterator;
+
+/**
+ * Unit tests for LinksIndexingFilter covering all combinations of the
+ * same-host filtering switches and the host-only indexing mode, for both
+ * outlinks and inlinks. Every test indexes the page
+ * http://www.example.com/ so www.example.com is the "same host".
+ */
+public class TestLinksIndexingFilter {
+
+  Configuration conf = NutchConfiguration.create();
+  LinksIndexingFilter filter = new LinksIndexingFilter();
+  Metadata metadata = new Metadata();
+
+  @Before
+  public void setUp() throws Exception {
+    metadata.add(Response.CONTENT_TYPE, "text/html");
+  }
+
+  private Outlink[] generateOutlinks() throws Exception {
+    return generateOutlinks(false);
+  }
+
+  /**
+   * Builds one outlink to www.test.com and one to www.example.com; when
+   * parts is true a path and query string are appended so host-only mode
+   * has something to strip.
+   */
+  private Outlink[] generateOutlinks(boolean parts) throws Exception {
+    Outlink[] outlinks = new Outlink[2];
+
+    outlinks[0] = new Outlink("http://www.test.com", "test");
+    outlinks[1] = new Outlink("http://www.example.com", "example");
+
+    if (parts) {
+      outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1",
+          "test");
+      outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2",
+          "test");
+    }
+
+    return outlinks;
+  }
+
+  // Same-host outlink filtering on: only the www.test.com outlink survives.
+  @Test
+  public void testFilterOutlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks();
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+    Assert.assertEquals("Filter outlinks, allow only those from a different host",
+        outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
+  }
+
+  // Same-host inlink filtering on: only the www.test.com inlink survives.
+  @Test
+  public void testFilterInlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com", "test"));
+    inlinks.add(new Inlink("http://www.example.com", "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals("Filter inlinks, allow only those from a different host",
+        "http://www.test.com", doc.getFieldValue("inlinks"));
+  }
+
+  // Filtering off (default config): every outlink is indexed.
+  @Test
+  public void testNoFilterOutlinks() throws Exception {
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks();
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals("All outlinks must be indexed even those from the same host",
+        outlinks.length, doc.getField("outlinks").getValues().size());
+  }
+
+  // Filtering explicitly off: every inlink is indexed.
+  @Test
+  public void testNoFilterInlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com", "test"));
+    inlinks.add(new Inlink("http://www.example.com", "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals("All inlinks must be indexed even those from the same host",
+        inlinks.size(), doc.getField("inlinks").getValues().size());
+  }
+
+  // Host-only mode plus both filters: links collapse to hosts and
+  // duplicate hosts are counted once.
+  @Test
+  public void testIndexOnlyHostPart() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks(true);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
+    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
+    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
+        "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+    NutchField docOutlinks = doc.getField("outlinks");
+
+    Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
+        new URL("http://www.test.com").getHost(),
+        docOutlinks.getValues().get(0));
+
+    Assert.assertEquals(
+        "The inlinks coming from the same host must count only once", 1,
+        doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
+        new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
+  }
+
+  // Host-only mode with outlink filtering on a fresh configuration.
+  @Test
+  public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+
+    Outlink[] outlinks = generateOutlinks(true);
+
+    filter.setConf(conf);
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+    Assert.assertEquals(
+        "Index only the host portion of the outlinks after filtering",
+        new URL("http://www.test.com").getHost(),
+        doc.getFieldValue("outlinks"));
+  }
+
+  // Host-only mode with inlink filtering on a fresh configuration.
+  @Test
+  public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+
+    filter.setConf(conf);
+
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com", "test"));
+    inlinks.add(new Inlink("http://www.example.com", "example"));
+
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
+            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+
+    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals(
+        "Index only the host portion of the inlinks after filtering",
+        new URL("http://www.test.com").getHost(),
+        doc.getFieldValue("inlinks"));
+
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
new file mode 100644
index 0000000..aaaedbf
--- /dev/null
+++ b/nutch-plugins/index-links/src/test/java/org/apache/nutch/parse/TestOutlinks.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.junit.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.*;
+
+public class TestOutlinks {
+
+  @Test
+  public void testAddSameObject() throws Exception {
+    Set<Outlink> set = new HashSet<>();
+
+    Outlink o = new Outlink("http://www.example.com", "Example");
+    set.add(o);
+    set.add(o);
+
+    assertEquals("Adding the same Outlink twice", 1, set.size());
+  }
+
+  @Test
+  public void testAddOtherObjectWithSameData() throws Exception {
+    Set<Outlink> set = new HashSet<>();
+
+    Outlink o = new Outlink("http://www.example.com", "Example");
+    Outlink o1 = new Outlink("http://www.example.com", "Example");
+
+    assertTrue("The two Outlink objects are the same", o.equals(o1));
+
+    set.add(o);
+    set.add(o1);
+
+    assertEquals("The set should contain only 1 Outlink", 1, set.size());
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/build.xml b/nutch-plugins/index-metadata/build.xml
new file mode 100644
index 0000000..ad96d11
--- /dev/null
+++ b/nutch-plugins/index-metadata/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-metadata" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/ivy.xml b/nutch-plugins/index-metadata/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/index-metadata/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/plugin.xml b/nutch-plugins/index-metadata/plugin.xml
new file mode 100644
index 0000000..4d4c9a7
--- /dev/null
+++ b/nutch-plugins/index-metadata/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-metadata"
+   name="Index Metadata"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+    <runtime>
+      <library name="index-metadata.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+
+   <extension id="org.apache.nutch.indexer.metadata"
+              name="Nutch metadata indexer"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="MetadataIndexer"
+                      class="org.apache.nutch.indexer.metadata.MetadataIndexer"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/pom.xml b/nutch-plugins/index-metadata/pom.xml
new file mode 100644
index 0000000..bef1b9a
--- /dev/null
+++ b/nutch-plugins/index-metadata/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-metadata</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-metadata</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
new file mode 100644
index 0000000..78718aa
--- /dev/null
+++ b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.metadata;
+
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Indexer which can be configured to extract metadata from the crawldb, parse
+ * metadata or content metadata. You can specify the properties "index.db.md",
+ * "index.parse.md" or "index.content.md" who's values are comma-delimited
+ * <value>key1,key2,key3</value>.
+ */
+public class MetadataIndexer implements IndexingFilter {
+  private Configuration conf;
+  private String[] dbFieldnames;
+  private Map<String, String> parseFieldnames;
+  private String[] contentFieldnames;
+  private static final String db_CONF_PROPERTY = "index.db.md";
+  private static final String parse_CONF_PROPERTY = "index.parse.md";
+  private static final String content_CONF_PROPERTY = "index.content.md";
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // just in case
+    if (doc == null)
+      return doc;
+
+    // add the fields from crawldb
+    if (dbFieldnames != null) {
+      for (String metatag : dbFieldnames) {
+        Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
+        if (metadata != null)
+          doc.add(metatag, metadata.toString());
+      }
+    }
+
+    // add the fields from parsemd
+    if (parseFieldnames != null) {
+      for (String metatag : parseFieldnames.keySet()) {
+        for (String value : parse.getData().getParseMeta().getValues(metatag)) {
+          if (value != null)
+            doc.add(parseFieldnames.get(metatag), value);
+        }
+      }
+    }
+
+    // add the fields from contentmd
+    if (contentFieldnames != null) {
+      for (String metatag : contentFieldnames) {
+        for (String value : parse.getData().getContentMeta().getValues(metatag)) {
+          if (value != null)
+            doc.add(metatag, value);
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
+    parseFieldnames = new HashMap<String, String>();
+    for (String metatag : conf.getStrings(parse_CONF_PROPERTY)) {
+      parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag);
+    }
+    contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
+
+    // TODO check conflict between field names e.g. could have same label
+    // from different sources
+
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}


[29/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
new file mode 100644
index 0000000..b631319
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -0,0 +1,569 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.StringUtils;
+
+import org.apache.nutch.crawl.CrawlDatum;
+
+import static org.apache.nutch.crawl.CrawlDatum.*;
+
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+
+import static org.junit.Assert.*;
+
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.Test;
+
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}):
+ * <ul>
+ * <li>simulate updatedb with the old CrawlDatum (db status) and the new one
+ * (fetch status) and test whether the resulting CrawlDatum has the appropriate
+ * status.</li>
+ * <li>also check for further CrawlDatum fields (signature, etc.)</li>
+ * <li>and additional conditions:</li>
+ * <ul>
+ * <li>retry counters</li>
+ * <li>signatures</li>
+ * <li>configuration properties</li>
+ * <li>(additional) CrawlDatums of status linked (stemming from inlinks)</li>
+ * </ul>
+ * </li> </ul>
+ */
+@Category({IntegrationTest.class})
+public class TestCrawlDbStates {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestCrawlDbStates.class);
+
+  protected static final byte[][] fetchDbStatusPairs = {
+      { -1, STATUS_DB_UNFETCHED }, { STATUS_FETCH_SUCCESS, STATUS_DB_FETCHED },
+      { STATUS_FETCH_GONE, STATUS_DB_GONE },
+      { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
+      { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM },
+      { STATUS_FETCH_NOTMODIFIED, STATUS_DB_NOTMODIFIED },
+      { STATUS_FETCH_RETRY, -1 }, // fetch_retry does not have a CrawlDb
+                                  // counter-part
+      { -1, STATUS_DB_DUPLICATE }, };
+
+  /** tested {@link FetchSchedule} implementations */
+  protected String[] schedules = { "DefaultFetchSchedule",
+      "AdaptiveFetchSchedule" };
+
+  /** CrawlDatum as result of a link */
+  protected final CrawlDatum linked = new CrawlDatum(STATUS_LINKED,
+      CrawlDBTestUtil.createConfiguration().getInt("db.fetch.interval.default",
+          2592000), 0.1f);
+
+  /**
+   * Test the matrix of state transitions:
+   * <ul>
+   * <li>for all available {@link FetchSchedule} implementations</li>
+   * <li>for every possible status in CrawlDb (including "not in CrawlDb")</li>
+   * <li>for every possible fetch status</li>
+   * <li>and zero or more (0-3) additional in-links</li>
+   * </ul>
+   * call {@literal updatedb} and check whether the resulting CrawlDb status is
+   * the expected one.
+   */
+  @Test
+  public void testCrawlDbStateTransitionMatrix() {
+    LOG.info("Test CrawlDatum state transitions");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>(
+        new CrawlDbReducer(), conf);
+    int retryMax = conf.getInt("db.fetch.retry.max", 3);
+    for (String sched : schedules) {
+      LOG.info("Testing state transitions with " + sched);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      FetchSchedule schedule = FetchScheduleFactory
+          .getFetchSchedule(new JobConf(conf));
+      for (int i = 0; i < fetchDbStatusPairs.length; i++) {
+        byte fromDbStatus = fetchDbStatusPairs[i][1];
+        for (int j = 0; j < fetchDbStatusPairs.length; j++) {
+          byte fetchStatus = fetchDbStatusPairs[j][0];
+          CrawlDatum fromDb = null;
+          if (fromDbStatus == -1) {
+            // nothing yet in CrawlDb
+            // CrawlDatum added by FreeGenerator or via outlink
+          } else {
+            fromDb = new CrawlDatum();
+            fromDb.setStatus(fromDbStatus);
+            // initialize fetchInterval:
+            schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
+          }
+          // expected db status
+          byte toDbStatus = fetchDbStatusPairs[j][1];
+          if (fetchStatus == -1) {
+            if (fromDbStatus == -1) {
+              // nothing fetched yet: new document detected via outlink
+              toDbStatus = STATUS_DB_UNFETCHED;
+            } else {
+              // nothing fetched but new inlinks detected: status is unchanged
+              toDbStatus = fromDbStatus;
+            }
+          } else if (fetchStatus == STATUS_FETCH_RETRY) {
+            // a simple test of fetch_retry (without retries)
+            if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) {
+              toDbStatus = STATUS_DB_UNFETCHED;
+            } else {
+              toDbStatus = STATUS_DB_GONE;
+            }
+          }
+          String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>"
+              : getStatusName(fromDbStatus));
+          String fetchStatusName = (fetchStatus == -1 ? "<only inlinks>"
+              : CrawlDatum.getStatusName(fetchStatus));
+          LOG.info(fromDbStatusName + " + " + fetchStatusName + " => "
+              + getStatusName(toDbStatus));
+          List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+          for (int l = 0; l <= 2; l++) { // number of additional in-links
+            CrawlDatum fetch = null;
+            if (fetchStatus == -1) {
+              // nothing fetched, need at least one in-link
+              if (l == 0)
+                continue;
+            } else {
+              fetch = new CrawlDatum();
+              if (fromDb != null) {
+                fetch.set(fromDb);
+              } else {
+                // not yet in CrawlDb: added by FreeGenerator
+                schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch);
+              }
+              fetch.setStatus(fetchStatus);
+              fetch.setFetchTime(System.currentTimeMillis());
+            }
+            if (fromDb != null)
+              values.add(fromDb);
+            if (fetch != null)
+              values.add(fetch);
+            for (int n = 0; n < l; n++) {
+              values.add(linked);
+            }
+            List<CrawlDatum> res = updateDb.update(values);
+            if (res.size() != 1) {
+              fail("CrawlDb update didn't result in one single CrawlDatum per URL");
+              continue;
+            }
+            byte status = res.get(0).getStatus();
+            if (status != toDbStatus) {
+              fail("CrawlDb update for " + fromDbStatusName + " and "
+                  + fetchStatusName + " and " + l + " inlinks results in "
+                  + getStatusName(status) + " (expected: "
+                  + getStatusName(toDbStatus) + ")");
+            }
+            values.clear();
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Test states after inject: inject must not modify the status of CrawlDatums
+   * already in CrawlDb. Newly injected elements have status "db_unfetched".
+   * Inject is simulated by calling {@link Injector.InjectReducer#reduce()}.
+   */
+  @Test
+  public void testCrawlDbStatTransitionInject() {
+    LOG.info("Test CrawlDatum states in Injector after inject");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    Injector.InjectReducer injector = new Injector.InjectReducer();
+    CrawlDbUpdateTestDriver<Injector.InjectReducer> injectDriver =
+        new CrawlDbUpdateTestDriver<Injector.InjectReducer>(injector, conf);
+    ScoringFilters scfilters = new ScoringFilters(conf);
+    for (String sched : schedules) {
+      LOG.info("Testing inject with " + sched);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      FetchSchedule schedule = FetchScheduleFactory
+          .getFetchSchedule(new JobConf(conf));
+      List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+      for (int i = 0; i < fetchDbStatusPairs.length; i++) {
+        byte fromDbStatus = fetchDbStatusPairs[i][1];
+        byte toDbStatus = fromDbStatus;
+        if (fromDbStatus == -1) {
+          toDbStatus = STATUS_DB_UNFETCHED;
+        } else {
+          CrawlDatum fromDb = new CrawlDatum();
+          fromDb.setStatus(fromDbStatus);
+          schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
+          values.add(fromDb);
+        }
+        LOG.info("inject "
+            + (fromDbStatus == -1 ? "<not in CrawlDb>" : CrawlDatum
+                .getStatusName(fromDbStatus)) + " + "
+            + getStatusName(STATUS_INJECTED) + " => "
+            + getStatusName(toDbStatus));
+        CrawlDatum injected = new CrawlDatum(STATUS_INJECTED, conf.getInt(
+            "db.fetch.interval.default", 2592000), 0.1f);
+        schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, injected);
+        try {
+          scfilters.injectedScore(CrawlDbUpdateUtil.dummyURL, injected);
+        } catch (ScoringFilterException e) {
+          LOG.error(StringUtils.stringifyException(e));
+        }
+        values.add(injected);
+        List<CrawlDatum> res = injectDriver.update(values);
+        if (res.size() != 1) {
+          fail("Inject didn't result in one single CrawlDatum per URL");
+          continue;
+        }
+        byte status = res.get(0).getStatus();
+        if (status != toDbStatus) {
+          fail("Inject for "
+              + (fromDbStatus == -1 ? "" : getStatusName(fromDbStatus)
+                  + " and ") + getStatusName(STATUS_INJECTED) + " results in "
+              + getStatusName(status) + " (expected: "
+              + getStatusName(toDbStatus) + ")");
+        }
+        values.clear();
+      }
+    }
+  }
+
+  /**
+   * Test status db_notmodified detected by
+   * <ul>
+   * <li>signature comparison</li>
+   * <li>or HTTP 304</li>
+   * </ul>
+   * In addition, test for all available {@link FetchSchedule} implementations
+   * whether
+   * <ul>
+   * <li>modified time is set</li>
+   * <li>re-fetch is triggered after a certain time to force the fetched content
+   * to be in a recent segment (old segments are deleted, see comments in
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}</li>
+   * </ul>
+   */
+  @Test
+  public void testCrawlDbReducerNotModified() {
+    LOG.info("Test state notmodified");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    // test not modified detected by signature comparison
+    for (String sched : schedules) {
+      String desc = "test notmodified by signature comparison + " + sched;
+      LOG.info(desc);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchNotModified(conf);
+      if (!crawlUtil.run(20)) {
+        fail("failed: " + desc);
+      }
+    }
+    // test not modified detected by HTTP 304
+    for (String sched : schedules) {
+      String desc = "test notmodified by HTTP 304 + " + sched;
+      LOG.info(desc);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchNotModifiedHttp304(
+          conf);
+      if (!crawlUtil.run(20)) {
+        fail("failed: " + desc);
+      }
+    }
+  }
+
+  protected class CrawlTestFetchNotModified extends ContinuousCrawlTestUtil {
+
+    /** time of the current fetch */
+    protected long currFetchTime;
+    /** time the last fetch took place */
+    protected long lastFetchTime;
+    /**
+     * time the document was fetched first (at all or after it has been changed)
+     */
+    protected long firstFetchTime;
+    /** state in CrawlDb before the last fetch */
+    protected byte previousDbState;
+    /** signature in CrawlDb of previous fetch */
+    protected byte[] lastSignature;
+
+    private long maxFetchInterval;
+    private FetchSchedule schedule;
+
+    CrawlTestFetchNotModified(Configuration conf) {
+      super(conf);
+      maxFetchInterval = conf.getLong("db.fetch.interval.max", 7776000); // default
+                                                                         // = 90
+                                                                         // days
+      maxFetchInterval += (24 * 60 * 60); // but take one day more to avoid
+                                          // false alarms
+      maxFetchInterval *= 1000; // in milli-seconds
+      schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf));
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (lastFetchTime > 0
+          && (currFetchTime - lastFetchTime) > maxFetchInterval) {
+        LOG.error("last effective fetch (HTTP 200, not HTTP 304), at "
+            + new Date(lastFetchTime)
+            + ", took place more than db.fetch.interval.max time, "
+            + "segment containing fetched content may have been deleted");
+        return false;
+      }
+      switch (result.getStatus()) {
+      case STATUS_DB_NOTMODIFIED:
+        // db_notmodified is correct if the document has been fetched previously
+        // and it has not been changed since
+        if ((previousDbState == STATUS_DB_FETCHED || previousDbState == STATUS_DB_NOTMODIFIED)) {
+          if (lastSignature != null
+              && result.getSignature() != null
+              && SignatureComparator._compare(lastSignature,
+                  result.getSignature()) != 0) {
+            LOG.error("document has changed (signature changed) but state is still "
+                + getStatusName(STATUS_DB_NOTMODIFIED));
+            return false;
+          }
+          LOG.info("ok: " + result);
+          return checkModifiedTime(result, firstFetchTime);
+        }
+        LOG.warn("notmodified without previous fetch");
+        break;
+      case STATUS_DB_FETCHED:
+        if (previousDbState == STATUS_DB_UNFETCHED) {
+          LOG.info("ok (first fetch): " + result);
+          return checkModifiedTime(result, firstFetchTime);
+        } else if (lastSignature != null
+            && result.getSignature() != null
+            && SignatureComparator._compare(lastSignature,
+                result.getSignature()) != 0) {
+          LOG.info("ok (content changed): " + result);
+          // expect modified time == now
+          return checkModifiedTime(result, currFetchTime);
+        } else {
+          LOG.warn("document has not changed, db_notmodified expected");
+        }
+        break;
+      case STATUS_DB_UNFETCHED:
+        /**
+         * Status db_unfetched is possible with {@link AdaptiveFetchSchedule}
+         * because {@link CrawlDbReducer#reduce} calls
+         * {@link FetchSchedule#forceRefetch} to force a re-fetch if fetch
+         * interval grows too large.
+         */
+        if (schedule.getClass() == AdaptiveFetchSchedule.class) {
+          LOG.info("state set to unfetched by AdaptiveFetchSchedule");
+          if (result.getSignature() != null) {
+            LOG.warn("must reset signature: " + result);
+            return false;
+          }
+          LOG.info("ok: " + result);
+          firstFetchTime = 0;
+          return true;
+        }
+      }
+      LOG.warn("wrong result: " + result);
+      return false;
+    }
+
+    // test modified time
+    private boolean checkModifiedTime(CrawlDatum result, long modifiedTime) {
+      if (result.getModifiedTime() == 0) {
+        LOG.error("modified time not set (TODO: not set by DefaultFetchSchedule)");
+        // TODO: return false (but DefaultFetchSchedule does not set modified
+        // time, see NUTCH-933)
+        return true;
+      } else if (modifiedTime == result.getModifiedTime()) {
+        return true;
+      }
+      LOG.error("wrong modified time: " + new Date(result.getModifiedTime())
+          + " (expected " + new Date(modifiedTime) + ")");
+      return false;
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      lastFetchTime = currFetchTime;
+      currFetchTime = currentTime;
+      previousDbState = datum.getStatus();
+      lastSignature = datum.getSignature();
+      datum = super.fetch(datum, currentTime);
+      if (firstFetchTime == 0) {
+        firstFetchTime = currFetchTime;
+      } else if ((currFetchTime - firstFetchTime) > (duration / 2)) {
+        // simulate a modification after "one year"
+        changeContent();
+        firstFetchTime = currFetchTime;
+      }
+      return datum;
+    }
+  }
+
+  protected class CrawlTestFetchNotModifiedHttp304 extends
+      CrawlTestFetchNotModified {
+
+    CrawlTestFetchNotModifiedHttp304(Configuration conf) {
+      super(conf);
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      lastFetchTime = currFetchTime;
+      currFetchTime = currentTime;
+      previousDbState = datum.getStatus();
+      lastSignature = datum.getSignature();
+      int httpCode;
+      /*
+       * document is "really" fetched (no HTTP 304) - if last-modified time or
+       * signature are unset (page has not been fetched before or fetch is
+       * forced) - for test purposes, we simulate a modified after "one year"
+       */
+      if (datum.getModifiedTime() == 0 && datum.getSignature() == null
+          || (currFetchTime - firstFetchTime) > (duration / 2)) {
+        firstFetchTime = currFetchTime;
+        httpCode = 200;
+        datum.setStatus(STATUS_FETCH_SUCCESS);
+        // modify content to change signature
+        changeContent();
+      } else {
+        httpCode = 304;
+        datum.setStatus(STATUS_FETCH_NOTMODIFIED);
+      }
+      LOG.info("fetched with HTTP " + httpCode + " => "
+          + getStatusName(datum.getStatus()));
+      datum.setFetchTime(currentTime);
+      return datum;
+    }
+  }
+
+  /**
+   * NUTCH-1245: a fetch_gone should always result in a db_gone.
+   * <p>
+   * Even in a long-running continuous crawl, when a gone page is re-fetched
+   * several times over time.
+   * </p>
+   */
+  @Test
+  public void testCrawlDbReducerPageGoneSchedule1() {
+    LOG.info("NUTCH-1245: test long running continuous crawl");
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(
+        STATUS_FETCH_GONE, STATUS_DB_GONE);
+    if (!crawlUtil.run(20)) {
+      fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+    }
+  }
+
+  /**
+   * NUTCH-1245: a fetch_gone should always result in a db_gone.
+   * <p>
+   * As some kind of misconfiguration set db.fetch.interval.default to a value
+   * &gt; (fetchIntervalMax * 1.5).
+   * </p>
+   */
+  @Test
+  public void testCrawlDbReducerPageGoneSchedule2() {
+    LOG.info("NUTCH-1245 (misconfiguration): test with db.fetch.interval.default > (1.5 * db.fetch.interval.max)");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    int fetchIntervalMax = conf.getInt("db.fetch.interval.max", 0);
+    conf.setInt("db.fetch.interval.default", 3 + (int) (fetchIntervalMax * 1.5));
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(conf,
+        STATUS_FETCH_GONE, STATUS_DB_GONE);
+    if (!crawlUtil.run(0)) {
+      fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+    }
+  }
+
+  /**
+   * Test whether signatures are reset for "content-less" states (gone,
+   * redirect, etc.): otherwise, if this state is temporary and the document
+   * appears again with the old content, it may get marked as not_modified in
+   * CrawlDb just after the redirect state. In this case we cannot expect
+   * content in segments. Cf. NUTCH-1422: reset signature for redirects.
+   */
+  // TODO: can only test if solution is done in CrawlDbReducer
+  @Test
+  public void testSignatureReset() {
+    LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    for (String sched : schedules) {
+      LOG.info("Testing reset signature with " + sched);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(conf);
+      if (!crawlUtil.run(20)) {
+        fail("failed: signature not reset");
+      }
+    }
+  }
+
+  private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil {
+
+    byte[][] noContentStates = { { STATUS_FETCH_GONE, STATUS_DB_GONE },
+        { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
+        { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM } };
+
+    int counter = 0;
+    byte fetchState;
+
+    public CrawlTestSignatureReset(Configuration conf) {
+      super(conf);
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      datum = super.fetch(datum, currentTime);
+      counter++;
+      // flip-flopping between successful fetch and one of content-less states
+      if (counter % 2 == 1) {
+        fetchState = STATUS_FETCH_SUCCESS;
+      } else {
+        fetchState = noContentStates[(counter % 6) / 2][0];
+      }
+      LOG.info("Step " + counter + ": fetched with "
+          + getStatusName(fetchState));
+      datum.setStatus(fetchState);
+      return datum;
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (result.getStatus() == STATUS_DB_NOTMODIFIED
+          && !(fetchState == STATUS_FETCH_SUCCESS || fetchState == STATUS_FETCH_NOTMODIFIED)) {
+        LOG.error("Should never get into state "
+            + getStatusName(STATUS_DB_NOTMODIFIED) + " from "
+            + getStatusName(fetchState));
+        return false;
+      }
+      if (result.getSignature() != null
+          && !(result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED)) {
+        LOG.error("Signature not reset in state "
+            + getStatusName(result.getStatus()));
+        // ok here: since it's not the problem itself (the db_notmodified), but
+        // the reason for it
+      }
+      return true;
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
new file mode 100644
index 0000000..0ce3c5f
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
@@ -0,0 +1,373 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Basic generator test. 1. Insert entries in crawldb 2. Generates entries to
+ * fetch 3. Verifies that number of generated urls match 4. Verifies that
+ * highest scoring urls are generated
+ * 
+ */
+@Category({IntegrationTest.class})
+public class TestGenerator {
+
+  Configuration conf;
+
+  Path dbDir;
+
+  Path segmentsDir;
+
+  FileSystem fs;
+
+  final static Path testdir = new Path("build/test/generator-test");
+
+  @Before
+  public void setUp() throws Exception {
+    conf = CrawlDBTestUtil.createConfiguration();
+    fs = FileSystem.get(conf);
+    fs.delete(testdir, true);
+  }
+
+  @After
+  public void tearDown() {
+    delete(testdir);
+  }
+
+  private void delete(Path p) {
+    try {
+      fs.delete(p, true);
+    } catch (IOException e) {
+    }
+  }
+
+  /**
+   * Test that generator generates fetchlish ordered by score (desc).
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testGenerateHighest() throws Exception {
+
+    final int NUM_RESULTS = 2;
+
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    for (int i = 0; i <= 100; i++) {
+      list.add(createURLCrawlDatum("http://aaa/" + pad(i), 1, i));
+    }
+
+    createCrawlDB(list);
+
+    Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false);
+
+    Path fetchlist = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
+    // sort urls by score desc
+    Collections.sort(l, new ScoreComparator());
+
+    // verify we got right amount of records
+    Assert.assertEquals(NUM_RESULTS, l.size());
+
+    // verify we have the highest scoring urls
+    Assert.assertEquals("http://aaa/100", (l.get(0).url.toString()));
+    Assert.assertEquals("http://aaa/099", (l.get(1).url.toString()));
+  }
+
+  private String pad(int i) {
+    String s = Integer.toString(i);
+    while (s.length() < 3) {
+      s = "0" + s;
+    }
+    return s;
+  }
+
+  /**
+   * Comparator that sorts by score desc.
+   */
+  public class ScoreComparator implements Comparator<URLCrawlDatum> {
+
+    public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) {
+        return -1;
+      }
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) {
+        return 1;
+      }
+      return 0;
+    }
+  }
+
+  /**
+   * Test that generator obeys the property "generate.max.per.host".
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testGenerateHostLimit() throws Exception {
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index1.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index2.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index3.html", 1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2);
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, false);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(1, fetchList.size());
+
+    myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(2, fetchList.size());
+
+    myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(3, fetchList.size());
+  }
+
+  /**
+   * Test that generator obeys the property "generator.max.count" and
+   * "generator.count.per.domain".
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testGenerateDomainLimit() throws Exception {
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://a.example.com/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://b.example.com/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://c.example.com/index.html", 1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2);
+    myConfiguration.set(Generator.GENERATOR_COUNT_MODE,
+        Generator.GENERATOR_COUNT_VALUE_DOMAIN);
+
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, false);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(1, fetchList.size());
+
+    myConfiguration = new Configuration(myConfiguration);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(2, fetchList.size());
+
+    myConfiguration = new Configuration(myConfiguration);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(3, fetchList.size());
+  }
+
+  /**
+   * Test generator obeys the filter setting.
+   * 
+   * @throws Exception
+   * @throws IOException
+   */
+  @Test
+  public void testFilter() throws IOException, Exception {
+
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.set("urlfilter.suffix.file", "filter-all.txt");
+
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, true);
+
+    Assert.assertNull("should be null (0 entries)", generatedSegment);
+
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify nothing got filtered
+    Assert.assertEquals(list.size(), fetchList.size());
+
+  }
+
+  /**
+   * Read contents of fetchlist.
+   * 
+   * @param fetchlist
+   *          path to Generated fetchlist
+   * @return Generated {@link URLCrawlDatum} objects
+   * @throws IOException
+   */
+  private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
+      throws IOException {
+    // verify results
+    Option rFile = SequenceFile.Reader.file(fetchlist);
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+
+    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+    READ: do {
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value)) {
+        break READ;
+      }
+      l.add(new URLCrawlDatum(key, value));
+    } while (true);
+
+    reader.close();
+    return l;
+  }
+
+  /**
+   * Generate Fetchlist.
+   * 
+   * @param numResults
+   *          number of results to generate
+   * @param config
+   *          Configuration to use
+   * @return path to generated segment
+   * @throws IOException
+   */
+  private Path generateFetchlist(int numResults, Configuration config,
+      boolean filter) throws IOException {
+    // generate segment
+    Generator g = new Generator(config);
+    Path[] generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
+        Long.MAX_VALUE, filter, false);
+    if (generatedSegment == null)
+      return null;
+    return generatedSegment[0];
+  }
+
+  /**
+   * Creates CrawlDB.
+   * 
+   * @param list
+   *          database contents
+   * @throws IOException
+   * @throws Exception
+   */
+  private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws IOException,
+      Exception {
+    dbDir = new Path(testdir, "crawldb");
+    segmentsDir = new Path(testdir, "segments");
+    fs.mkdirs(dbDir);
+    fs.mkdirs(segmentsDir);
+
+    // create crawldb
+    CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
+  }
+
+  /**
+   * Constructs new {@link URLCrawlDatum} from submitted parameters.
+   * 
+   * @param url
+   *          url to use
+   * @param fetchInterval
+   *          {@link CrawlDatum#setFetchInterval(float)}
+   * @param score
+   *          {@link CrawlDatum#setScore(float)}
+   * @return Constructed object
+   */
+  private URLCrawlDatum createURLCrawlDatum(final String url,
+      final int fetchInterval, final float score) {
+    return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
+        CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
new file mode 100644
index 0000000..59a3e8c
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Basic injector test: 1. Creates a text file with urls 2. Injects them into
+ * crawldb 3. Reads crawldb entries and verifies contents 4. Injects more urls
+ * into webdb 5. Reads crawldb entries and verifies contents
+ * 
+ */
+@Category({IntegrationTest.class})
+public class TestInjector {
+
+  private Configuration conf;
+  private FileSystem fs;
+  final static Path testdir = new Path("build/test/inject-test");
+  Path crawldbPath;
+  Path urlPath;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = CrawlDBTestUtil.createConfiguration();
+    urlPath = new Path(testdir, "urls");
+    crawldbPath = new Path(testdir, "crawldb");
+    fs = FileSystem.get(conf);
+    if (fs.exists(urlPath))
+      fs.delete(urlPath, false);
+    if (fs.exists(crawldbPath))
+      fs.delete(crawldbPath, true);
+  }
+
+  @After
+  public void tearDown() throws IOException {
+    fs.delete(testdir, true);
+  }
+
+  @Test
+  public void testInject()
+      throws IOException, ClassNotFoundException, InterruptedException {
+    ArrayList<String> urls = new ArrayList<String>();
+    // We'll use a separate list for MD so we can still compare url with
+    // containsAll
+    ArrayList<String> metadata = new ArrayList<String>();
+    for (int i = 0; i < 100; i++) {
+      urls.add("http://zzz.com/" + i + ".html");
+      metadata.add("\tnutch.score=2." + i
+          + "\tnutch.fetchInterval=171717\tkey=value");
+    }
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata);
+
+    Injector injector = new Injector(conf);
+    injector.inject(crawldbPath, urlPath);
+
+    // verify results
+    List<String> read = readCrawldb();
+
+    Collections.sort(read);
+    Collections.sort(urls);
+
+    Assert.assertEquals(urls.size(), read.size());
+
+    Assert.assertTrue(read.containsAll(urls));
+    Assert.assertTrue(urls.containsAll(read));
+
+    // inject more urls
+    ArrayList<String> urls2 = new ArrayList<String>();
+    for (int i = 0; i < 100; i++) {
+      urls2.add("http://xxx.com/" + i + ".html");
+      // We'll overwrite previously injected records but preserve their original
+      // MD
+      urls2.add("http://zzz.com/" + i + ".html");
+    }
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
+    injector = new Injector(conf);
+    conf.setBoolean("db.injector.update", true);
+    injector.inject(crawldbPath, urlPath);
+    urls.addAll(urls2);
+
+    // verify results
+    read = readCrawldb();
+
+    Collections.sort(read);
+    Collections.sort(urls);
+
+    // We should have 100 less records because we've overwritten
+    Assert.assertEquals(urls.size() - 100, read.size());
+
+    Assert.assertTrue(read.containsAll(urls));
+    Assert.assertTrue(urls.containsAll(read));
+
+    // Check if we correctly preserved MD
+    Map<String, CrawlDatum> records = readCrawldbRecords();
+
+    // Iterate over the urls, we're looking for http://zzz.com/ prefixed URLs
+    // so we can check for MD and score and interval
+    Text writableKey = new Text("key");
+    Text writableValue = new Text("value");
+    for (String url : urls) {
+      if (url.indexOf("http://zzz") == 0) {
+        // Check for fetch interval
+        Assert.assertTrue(records.get(url).getFetchInterval() == 171717);
+        // Check for default score
+        Assert.assertTrue(records.get(url).getScore() != 1.0);
+        // Check for MD key=value
+        Assert.assertEquals(writableValue,
+            records.get(url).getMetaData().get(writableKey));
+      }
+    }
+  }
+
+  private List<String> readCrawldb() throws IOException {
+    Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
+        + "/part-r-00000/data");
+    System.out.println("reading:" + dbfile);
+    Option rFile = SequenceFile.Reader.file(dbfile);
+    @SuppressWarnings("resource")
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+    ArrayList<String> read = new ArrayList<String>();
+
+    READ: do {
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value))
+        break READ;
+      read.add(key.toString());
+    } while (true);
+
+    return read;
+  }
+
+  private HashMap<String, CrawlDatum> readCrawldbRecords() throws IOException {
+    Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
+        + "/part-r-00000/data");
+    System.out.println("reading:" + dbfile);
+    Option rFile = SequenceFile.Reader.file(dbfile);
+    @SuppressWarnings("resource")
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+    HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>();
+
+    READ: do {
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value))
+        break READ;
+      read.put(key.toString(), value);
+    } while (true);
+
+    return read;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
new file mode 100644
index 0000000..23aaa88
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestLinkDbMerger {
+  private static final Logger LOG = Logger.getLogger(TestLinkDbMerger.class
+      .getName());
+
+  String url10 = "http://example.com/foo";
+  String[] urls10 = new String[] { "http://example.com/100",
+      "http://example.com/101" };
+
+  String url11 = "http://example.com/";
+  String[] urls11 = new String[] { "http://example.com/110",
+      "http://example.com/111" };
+
+  String url20 = "http://example.com/";
+  String[] urls20 = new String[] { "http://foo.com/200", "http://foo.com/201" };
+  String url21 = "http://example.com/bar";
+  String[] urls21 = new String[] { "http://foo.com/210", "http://foo.com/211" };
+
+  String[] urls10_expected = urls10;
+  String[] urls11_expected = new String[] { urls11[0], urls11[1], urls20[0],
+      urls20[1] };
+  String[] urls20_expected = urls11_expected;
+  String[] urls21_expected = urls21;
+
+  TreeMap<String, String[]> init1 = new TreeMap<String, String[]>();
+  TreeMap<String, String[]> init2 = new TreeMap<String, String[]>();
+  HashMap<String, String[]> expected = new HashMap<String, String[]>();
+  Configuration conf;
+  Path testDir;
+  FileSystem fs;
+  LinkDbReader reader;
+
+  @Before
+  public void setUp() throws Exception {
+    init1.put(url10, urls10);
+    init1.put(url11, urls11);
+    init2.put(url20, urls20);
+    init2.put(url21, urls21);
+    expected.put(url10, urls10_expected);
+    expected.put(url11, urls11_expected);
+    expected.put(url20, urls20_expected);
+    expected.put(url21, urls21_expected);
+    conf = NutchConfiguration.create();
+    fs = FileSystem.get(conf);
+    testDir = new Path("build/test/test-linkdb-"
+        + new java.util.Random().nextInt());
+    fs.mkdirs(testDir);
+  }
+
+  @After
+  public void tearDown() {
+    try {
+      if (fs.exists(testDir))
+        fs.delete(testDir, true);
+    } catch (Exception e) {
+    }
+    try {
+      reader.close();
+    } catch (Exception e) {
+    }
+  }
+
+  @Test
+  public void testMerge() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    FileSystem fs = FileSystem.get(conf);
+    fs.mkdirs(testDir);
+    Path linkdb1 = new Path(testDir, "linkdb1");
+    Path linkdb2 = new Path(testDir, "linkdb2");
+    Path output = new Path(testDir, "output");
+    createLinkDb(conf, fs, linkdb1, init1);
+    createLinkDb(conf, fs, linkdb2, init2);
+    LinkDbMerger merger = new LinkDbMerger(conf);
+    LOG.fine("* merging linkdbs to " + output);
+    merger.merge(output, new Path[] { linkdb1, linkdb2 }, false, false);
+    LOG.fine("* reading linkdb: " + output);
+    reader = new LinkDbReader(conf, output);
+    Iterator<String> it = expected.keySet().iterator();
+    while (it.hasNext()) {
+      String url = it.next();
+      LOG.fine("url=" + url);
+      String[] vals = expected.get(url);
+      Inlinks inlinks = reader.getInlinks(new Text(url));
+      // may not be null
+      Assert.assertNotNull(inlinks);
+      ArrayList<String> links = new ArrayList<String>();
+      Iterator<?> it2 = inlinks.iterator();
+      while (it2.hasNext()) {
+        Inlink in = (Inlink) it2.next();
+        links.add(in.getFromUrl());
+      }
+      for (int i = 0; i < vals.length; i++) {
+        LOG.fine(" -> " + vals[i]);
+        Assert.assertTrue(links.contains(vals[i]));
+      }
+    }
+    reader.close();
+    fs.delete(testDir, true);
+  }
+
+  private void createLinkDb(Configuration config, FileSystem fs, Path linkdb,
+      TreeMap<String, String[]> init) throws Exception {
+    LOG.fine("* creating linkdb: " + linkdb);
+    Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
+    
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(Inlinks.class);
+    MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+        "part-00000"), wKeyOpt, wValueOpt);
+    Iterator<String> it = init.keySet().iterator();
+    while (it.hasNext()) {
+      String key = it.next();
+      Inlinks inlinks = new Inlinks();
+      String[] vals = init.get(key);
+      for (int i = 0; i < vals.length; i++) {
+        Inlink in = new Inlink(vals[i], vals[i]);
+        inlinks.add(in);
+      }
+      writer.append(new Text(key), inlinks);
+    }
+    writer.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
new file mode 100644
index 0000000..db82d7a
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestSignatureFactory {
+
+  @Test
+  public void testGetSignature() {
+    Configuration conf = NutchConfiguration.create();
+    Signature signature1 = SignatureFactory.getSignature(conf);
+    Signature signature2 = SignatureFactory.getSignature(conf);
+    Assert.assertNotNull(signature1);
+    Assert.assertNotNull(signature2);
+    Assert.assertEquals(signature1, signature2);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
new file mode 100644
index 0000000..a23d080
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDBTestUtil;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.mortbay.jetty.Server;
+
+/**
+ * Basic fetcher test: 1. generate seed list, 2. inject, 3. generate,
+ * 4. fetch, 5. verify fetched content and parse data.
+ */
+public class TestFetcher {
+
+  // scratch directory; wiped in setUp() and tearDown()
+  final static Path testdir = new Path("build/test/fetch-test");
+  Configuration conf;
+  FileSystem fs;
+  Path crawldbPath;
+  Path segmentsPath;
+  Path urlPath;
+  // embedded Jetty server serving the static fetch-test-site fixture pages
+  Server server;
+
+  /** Creates a clean test directory and starts a local HTTP server with the test site. */
+  @Before
+  public void setUp() throws Exception {
+    conf = CrawlDBTestUtil.createConfiguration();
+    fs = FileSystem.get(conf);
+    fs.delete(testdir, true);
+    urlPath = new Path(testdir, "urls");
+    crawldbPath = new Path(testdir, "crawldb");
+    segmentsPath = new Path(testdir, "segments");
+    server = CrawlDBTestUtil.getServer(
+        conf.getInt("content.server.port", 50000),
+        "build/test/data/fetch-test-site");
+    server.start();
+  }
+
+  /** Stops the server, waiting up to ~5 s for shutdown, then removes the test dir. */
+  @After
+  public void tearDown() throws Exception {
+    server.stop();
+    for (int i = 0; i < 5; i++) {
+      if (!server.isStopped()) {
+       Thread.sleep(1000);
+      }
+    }
+    fs.delete(testdir, true);
+  }
+
+  /**
+   * End-to-end fetch test: injects and generates a segment from seed URLs,
+   * fetches it from the local server, then verifies politeness timing,
+   * fetched content, and parse metadata.
+   */
+  @Test
+  @Category(IntegrationTest.class)
+  public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
+
+    // generate seedlist
+    ArrayList<String> urls = new ArrayList<String>();
+
+    addUrl(urls, "index.html");
+    addUrl(urls, "pagea.html");
+    addUrl(urls, "pageb.html");
+    addUrl(urls, "dup_of_pagea.html");
+    addUrl(urls, "nested_spider_trap.html");
+    addUrl(urls, "exception.html");
+
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+
+    // inject
+    Injector injector = new Injector(conf);
+    injector.inject(crawldbPath, urlPath);
+
+    // generate
+    Generator g = new Generator(conf);
+    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
+        Long.MAX_VALUE, Long.MAX_VALUE, false, false);
+
+    long time = System.currentTimeMillis();
+    // fetch
+    Fetcher fetcher = new Fetcher(conf);
+
+    // Set fetcher.parse to true
+    conf.setBoolean("fetcher.parse", true);
+
+    fetcher.fetch(generatedSegment[0], 1);
+
+    time = System.currentTimeMillis() - time;
+
+    // verify politeness, time taken should be more than (num_of_pages +1)*delay
+    int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat(
+        "fetcher.server.delay", 5));
+    Assert.assertTrue(time > minimumTime);
+
+    // verify content
+    Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME),
+        "part-00000/data");
+    @SuppressWarnings("resource")
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
+
+    ArrayList<String> handledurls = new ArrayList<String>();
+
+    // collect URLs whose fetched content contains the test-page marker text
+    READ_CONTENT: do {
+      Text key = new Text();
+      Content value = new Content();
+      if (!reader.next(key, value))
+        break READ_CONTENT;
+      String contentString = new String(value.getContent());
+      if (contentString.indexOf("Nutch fetcher test page") != -1) {
+        handledurls.add(key.toString());
+      }
+    } while (true);
+
+    reader.close();
+
+    Collections.sort(urls);
+    Collections.sort(handledurls);
+
+    // verify that enough pages were handled
+    Assert.assertEquals(urls.size(), handledurls.size());
+
+    // verify that correct pages were handled
+    Assert.assertTrue(handledurls.containsAll(urls));
+    Assert.assertTrue(urls.containsAll(handledurls));
+
+    handledurls.clear();
+
+    // verify parse data
+    Path parseData = new Path(
+        new Path(generatedSegment[0], ParseData.DIR_NAME), "part-00000/data");
+    reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
+
+    READ_PARSE_DATA: do {
+      Text key = new Text();
+      ParseData value = new ParseData();
+      if (!reader.next(key, value))
+        break READ_PARSE_DATA;
+      // make sure they all contain "nutch.segment.name" and
+      // "nutch.content.digest"
+      // keys in parse metadata
+      Metadata contentMeta = value.getContentMeta();
+      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
+          && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
+        handledurls.add(key.toString());
+      }
+    } while (true);
+
+    // NOTE(review): unlike the content loop above, this second reader is not
+    // closed (covered only by the @SuppressWarnings("resource") above)
+    Collections.sort(handledurls);
+
+    Assert.assertEquals(urls.size(), handledurls.size());
+
+    Assert.assertTrue(handledurls.containsAll(urls));
+    Assert.assertTrue(urls.containsAll(handledurls));
+  }
+
+  /** Appends the absolute URL of {@code page} on the local test server to {@code urls}. */
+  private void addUrl(ArrayList<String> urls, String page) {
+    urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/"
+        + page);
+  }
+
+  /** Verifies the fetcher refuses to run when 'http.agent.name' is empty. */
+  @Test
+  public void testAgentNameCheck() {
+
+    boolean failedNoAgentName = false;
+    conf.set("http.agent.name", "");
+
+    try {
+      conf.setBoolean("fetcher.parse", false);
+      Fetcher fetcher = new Fetcher(conf);
+      fetcher.fetch(null, 1);
+    } catch (IllegalArgumentException iae) {
+      String message = iae.getMessage();
+      failedNoAgentName = message.equals("Fetcher: No agents listed in "
+          + "'http.agent.name' property.");
+    } catch (Exception e) {
+      // any other exception leaves failedNoAgentName false; the assert below fails
+    }
+
+    Assert.assertTrue(failedNoAgentName);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
new file mode 100644
index 0000000..3a25f26
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.hadoop.mrunit.ReduceDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.Reducer;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+
+/** Test {@link IndexerMapReduce} */
+public class TestIndexerMapReduce {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestIndexerMapReduce.class);
+
+  // fixed URL used as the reduce key for all test records
+  public static String testUrl = "http://nutch.apache.org/";
+  public static Text testUrlText = new Text(testUrl);
+  public static String htmlContentType = "text/html";
+  // Multi-language HTML fixture used to exercise charset handling.
+  // NOTE(review): several non-ASCII characters below appear as U+FFFD in this
+  // archived copy of the patch -- verify against the repository version.
+  public static String testHtmlDoc = "<!DOCTYPE html>\n"
+      + "<html>\n"
+      + "<head>\n"
+      + "<title>Test Indexing Binary Content</title>\n"
+      + "<meta charset=\"utf-8\">\n"
+      + "<meta name=\"keywords\" lang=\"en\" content=\"charset, encoding\" />\n"
+      + "<meta name=\"keywords\" lang=\"fr\" content=\"codage des caract�res\" />\n"
+      + "<meta name=\"keywords\" lang=\"cs\" content=\"k�dov�n� znak\u016f\" />\n"
+      + "</head>\n"
+      + "<body>\n"
+      + "<p>\n"
+      + "<ul>\n"
+      + "  <li lang=\"en\">English: character set, encoding\n"
+      + "  <li lang=\"fr\">Fran�ais: codage des caract�res\n"
+      + "  <li lang=\"cs\">\u010ce\u0161tina: k�dov�n� znak\u016f (not covered by Latin-1)\n"
+      + "</ul>\n"
+      + "</body>\n"
+      + "</html>";
+  public static Metadata htmlMeta = new Metadata();
+  static {
+    htmlMeta.add("Content-Type", "text/html");
+    // add segment and signature to avoid NPEs
+    htmlMeta.add(Nutch.SEGMENT_NAME_KEY, "123");
+    htmlMeta.add(Nutch.SIGNATURE_KEY, "123");
+  }
+  public static ParseText parseText = new ParseText("Test");
+  public static ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+      "Test", new Outlink[] {}, htmlMeta);
+  // canned CrawlDb / fetch-status records fed to the reducer
+  public static CrawlDatum crawlDatumDbFetched = new CrawlDatum(
+      CrawlDatum.STATUS_DB_FETCHED, 60 * 60 * 24);
+  public static CrawlDatum crawlDatumFetchSuccess = new CrawlDatum(
+      CrawlDatum.STATUS_FETCH_SUCCESS, 60 * 60 * 24);
+
+  // MRUnit harness around the IndexerMapReduce reducer; the driver is
+  // (re)created on every runIndexer() call
+  private Reducer<Text, NutchWritable, Text, NutchIndexAction> reducer = new IndexerMapReduce();
+  private ReduceDriver<Text, NutchWritable, Text, NutchIndexAction> reduceDriver;
+  private Configuration configuration;
+
+
+  /**
+   * Test indexing of base64-encoded binary content for UTF-8, ISO-8859-1
+   * and ISO-8859-2 variants of the test document.
+   */
+  @Test
+  @Category(IntegrationTest.class)
+  public void testBinaryContentBase64() {
+    configuration = NutchConfiguration.create();
+    configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);
+
+    Charset[] testCharsets = { StandardCharsets.UTF_8,
+        Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") };
+    for (Charset charset : testCharsets) {
+      LOG.info("Testing indexing binary content as base64 for charset {}",
+          charset.name());
+
+      // adapt the fixture so it only contains text representable in the charset
+      String htmlDoc = testHtmlDoc;
+      if (charset != StandardCharsets.UTF_8) {
+        htmlDoc = htmlDoc.replaceAll("utf-8", charset.name());
+        if (charset.name().equalsIgnoreCase("iso-8859-1")) {
+          // Western-European character set: remove Czech content
+          htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", "");
+        } else if (charset.name().equalsIgnoreCase("iso-8859-2")) {
+          // Eastern-European character set: remove French content
+          htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", "");
+        }
+      }
+
+      Content content = new Content(testUrl, testUrl,
+          htmlDoc.getBytes(charset), htmlContentType, htmlMeta,
+          configuration);
+
+      NutchDocument doc = runIndexer(crawlDatumDbFetched,
+          crawlDatumFetchSuccess, parseText, parseData, content);
+      assertNotNull("No NutchDocument indexed", doc);
+
+      // decode the indexed field and compare against the original document
+      String binaryContentBase64 = (String) doc.getField("binaryContent")
+          .getValues().get(0);
+      LOG.info("binary content (base64): {}", binaryContentBase64);
+      String binaryContent = new String(
+          Base64.decodeBase64(binaryContentBase64), charset);
+      LOG.info("binary content (decoded): {}", binaryContent);
+      assertEquals(
+          "Binary content (" + charset + ") not correctly saved as base64",
+          htmlDoc, binaryContent);
+    }
+  }
+
+  /**
+   * Run {@link IndexerMapReduce.reduce(...)} to get a &quot;indexed&quot;
+   * {@link NutchDocument} by passing objects from segment and CrawlDb to the
+   * indexer.
+   *
+   * @param dbDatum
+   *          crawl datum from CrawlDb
+   * @param fetchDatum
+   *          crawl datum (fetch status) from segment
+   * @param parseText
+   *          plain text from parsed document
+   * @param parseData
+   *          parse data
+   * @param content
+   *          (optional, if index binary content) protocol content
+   * @return &quot;indexed&quot; document, or null if the reducer produced no
+   *         non-delete action or failed with an IOException
+   */
+  public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum,
+      ParseText parseText, ParseData parseData, Content content) {
+    List<NutchWritable> values = new ArrayList<NutchWritable>();
+    values.add(new NutchWritable(dbDatum));
+    values.add(new NutchWritable(fetchDatum));
+    values.add(new NutchWritable(parseText));
+    values.add(new NutchWritable(parseData));
+    values.add(new NutchWritable(content));
+    reduceDriver = ReduceDriver.newReduceDriver(reducer);
+    reduceDriver.setConfiguration(configuration);
+    reduceDriver.withInput(testUrlText, values);
+    List<Pair<Text, NutchIndexAction>> reduceResult;
+    NutchDocument doc = null;
+    try {
+      reduceResult = reduceDriver.run();
+      for (Pair<Text, NutchIndexAction> p : reduceResult) {
+        // keep the document of the last non-delete action
+        if (p.getSecond().action != NutchIndexAction.DELETE) {
+          doc = p.getSecond().doc;
+        }
+      }
+    } catch (IOException e) {
+      // log and fall through: callers detect failure via the null return
+      LOG.error(StringUtils.stringifyException(e));
+    }
+    return doc;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
new file mode 100644
index 0000000..14b246b
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Tests for {@link IndexingFilters}: behaviour with unknown filter classes,
+ * null documents, and the cached filter order.
+ */
+@Category(IntegrationTest.class)
+public class TestIndexingFilters {
+
+  /**
+   * Test behaviour when a configured filter class does not exist: filtering
+   * must still run without throwing.
+   * 
+   * @throws IndexingException
+   */
+  @Test
+  public void testNonExistingIndexingFilter() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("crawl-tests.xml");
+
+    String class1 = "NonExistingFilter";
+    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
+
+    IndexingFilters filters = new IndexingFilters(conf);
+    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
+        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
+        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+  }
+
+  /**
+   * Test behaviour when the NutchDocument passed in is null.
+   */
+  @Test
+  public void testNutchDocumentNullIndexingFilter() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("crawl-tests.xml");
+
+    IndexingFilters filters = new IndexingFilters(conf);
+    NutchDocument doc = filters.filter(null, new ParseImpl("text",
+        new ParseData(new ParseStatus(), "title", new Outlink[0],
+            new Metadata())), new Text("http://www.example.com/"),
+        new CrawlDatum(), new Inlinks());
+
+    // a null input document must come back as null, not as an empty document
+    Assert.assertNull(doc);
+  }
+
+  /**
+   * Test behaviour when the index filter order is reset: the change should
+   * not take effect, presumably because the instantiated filter set is
+   * cached -- TODO confirm in IndexingFilters.
+   * 
+   * @throws IndexingException
+   */
+  @Test
+  public void testFilterCacheIndexingFilter() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("crawl-tests.xml");
+
+    String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);
+
+    IndexingFilters filters1 = new IndexingFilters(conf);
+    NutchDocument fdoc1 = filters1.filter(new NutchDocument(), new ParseImpl(
+        "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
+            new Metadata())), new Text("http://www.example.com/"),
+        new CrawlDatum(), new Inlinks());
+
+    // add another index filter
+    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
+    // set content metadata
+    Metadata md = new Metadata();
+    md.add("example", "data");
+    // set content metadata property defined in MetadataIndexer
+    conf.set("index.content.md", "example");
+    // add MetadataIndexer filter
+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
+    IndexingFilters filters2 = new IndexingFilters(conf);
+    NutchDocument fdoc2 = filters2.filter(new NutchDocument(), new ParseImpl(
+        "text", new ParseData(new ParseStatus(), "title", new Outlink[0], md)),
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+    // same field count expected: the added MetadataIndexer must not have
+    // contributed extra fields
+    Assert.assertEquals(fdoc1.getFieldNames().size(), fdoc2.getFieldNames()
+        .size());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java b/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
new file mode 100644
index 0000000..f3a320d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
@@ -0,0 +1,281 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Properties;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}.
+ */
+public class TestMetadata {
+
+  // metadata key shared by most of the tests below
+  private static final String CONTENTTYPE = "contenttype";
+
+  /**
+   * Test to ensure that only non-null values get written when the
+   * {@link Metadata} object is written using a Writeable.
+   * 
+   * @since NUTCH-406
+   * 
+   */
+  @Test
+  public void testWriteNonNull() {
+    Metadata met = new Metadata();
+    met.add(CONTENTTYPE, null);
+    met.add(CONTENTTYPE, "text/bogus");
+    met.add(CONTENTTYPE, "text/bogus2");
+    met = writeRead(met);
+
+    Assert.assertNotNull(met);
+    Assert.assertEquals(met.size(), 1);
+
+    boolean hasBogus = false, hasBogus2 = false;
+
+    // only the two non-null values should have survived the round trip
+    String[] values = met.getValues(CONTENTTYPE);
+    Assert.assertNotNull(values);
+    Assert.assertEquals(values.length, 2);
+
+    for (int i = 0; i < values.length; i++) {
+      if (values[i].equals("text/bogus")) {
+        hasBogus = true;
+      }
+
+      if (values[i].equals("text/bogus2")) {
+        hasBogus2 = true;
+      }
+    }
+
+    Assert.assertTrue(hasBogus && hasBogus2);
+  }
+
+  /** Test for the <code>add(String, String)</code> method: values accumulate in order. */
+  @Test
+  public void testAdd() {
+    String[] values = null;
+    Metadata meta = new Metadata();
+
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(0, values.length);
+
+    meta.add(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1", values[0]);
+
+    meta.add(CONTENTTYPE, "value2");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(2, values.length);
+    Assert.assertEquals("value1", values[0]);
+    Assert.assertEquals("value2", values[1]);
+
+    // NOTE : For now, the same value can be added many times.
+    // Should it be changed?
+    meta.add(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(3, values.length);
+    Assert.assertEquals("value1", values[0]);
+    Assert.assertEquals("value2", values[1]);
+    Assert.assertEquals("value1", values[2]);
+  }
+
+  /** Test for the <code>set(String, String)</code> method: set replaces all previous values. */
+  @Test
+  public void testSet() {
+    String[] values = null;
+    Metadata meta = new Metadata();
+
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(0, values.length);
+
+    meta.set(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1", values[0]);
+
+    meta.set(CONTENTTYPE, "value2");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value2", values[0]);
+
+    meta.set(CONTENTTYPE, "new value 1");
+    meta.add("contenttype", "new value 2");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(2, values.length);
+    Assert.assertEquals("new value 1", values[0]);
+    Assert.assertEquals("new value 2", values[1]);
+  }
+
+  /** Test for <code>setAll(Properties)</code> method. */
+  @Test
+  public void testSetProperties() {
+    String[] values = null;
+    Metadata meta = new Metadata();
+    Properties props = new Properties();
+
+    meta.setAll(props);
+    Assert.assertEquals(0, meta.size());
+
+    props.setProperty("name-one", "value1.1");
+    meta.setAll(props);
+    Assert.assertEquals(1, meta.size());
+    values = meta.getValues("name-one");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1.1", values[0]);
+
+    props.setProperty("name-two", "value2.1");
+    meta.setAll(props);
+    Assert.assertEquals(2, meta.size());
+    values = meta.getValues("name-one");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1.1", values[0]);
+    values = meta.getValues("name-two");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value2.1", values[0]);
+  }
+
+  /** Test for <code>get(String)</code> method: returns the first value or null. */
+  @Test
+  public void testGet() {
+    Metadata meta = new Metadata();
+    Assert.assertNull(meta.get("a-name"));
+    meta.add("a-name", "value-1");
+    Assert.assertEquals("value-1", meta.get("a-name"));
+    meta.add("a-name", "value-2");
+    Assert.assertEquals("value-1", meta.get("a-name"));
+  }
+
+  /** Test for <code>isMultiValued()</code> method. */
+  @Test
+  public void testIsMultiValued() {
+    Metadata meta = new Metadata();
+    Assert.assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value1");
+    Assert.assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value2");
+    Assert.assertTrue(meta.isMultiValued("key"));
+  }
+
+  /** Test for <code>names</code> method. */
+  @Test
+  public void testNames() {
+    String[] names = null;
+    Metadata meta = new Metadata();
+    names = meta.names();
+    Assert.assertEquals(0, names.length);
+
+    meta.add("name-one", "value");
+    names = meta.names();
+    Assert.assertEquals(1, names.length);
+    Assert.assertEquals("name-one", names[0]);
+    meta.add("name-two", "value");
+    names = meta.names();
+    Assert.assertEquals(2, names.length);
+  }
+
+  /** Test for <code>remove(String)</code> method: removes all values of a name. */
+  @Test
+  public void testRemove() {
+    Metadata meta = new Metadata();
+    meta.remove("name-one");
+    Assert.assertEquals(0, meta.size());
+    meta.add("name-one", "value-1.1");
+    meta.add("name-one", "value-1.2");
+    meta.add("name-two", "value-2.2");
+    Assert.assertEquals(2, meta.size());
+    Assert.assertNotNull(meta.get("name-one"));
+    Assert.assertNotNull(meta.get("name-two"));
+    meta.remove("name-one");
+    Assert.assertEquals(1, meta.size());
+    Assert.assertNull(meta.get("name-one"));
+    Assert.assertNotNull(meta.get("name-two"));
+    meta.remove("name-two");
+    Assert.assertEquals(0, meta.size());
+    Assert.assertNull(meta.get("name-one"));
+    Assert.assertNull(meta.get("name-two"));
+  }
+
+  /** Test for <code>equals(Object)</code> method. */
+  @Test
+  public void testObject() {
+    Metadata meta1 = new Metadata();
+    Metadata meta2 = new Metadata();
+    Assert.assertFalse(meta1.equals(null));
+    Assert.assertFalse(meta1.equals("String"));
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.1");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.1");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.2");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.2");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.1");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-two", "value-2.1");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.2");
+    Assert.assertFalse(meta1.equals(meta2));
+    // differing values for the same name must keep the objects unequal
+    meta2.add("name-two", "value-2.x");
+    Assert.assertFalse(meta1.equals(meta2));
+  }
+
+  /** Test for <code>Writable</code> implementation. */
+  @Test
+  public void testWritable() {
+    Metadata result = null;
+    Metadata meta = new Metadata();
+    result = writeRead(meta);
+    Assert.assertEquals(0, result.size());
+    meta.add("name-one", "value-1.1");
+    result = writeRead(meta);
+    Assert.assertEquals(1, result.size());
+    Assert.assertEquals(1, result.getValues("name-one").length);
+    Assert.assertEquals("value-1.1", result.get("name-one"));
+    meta.add("name-two", "value-2.1");
+    meta.add("name-two", "value-2.2");
+    result = writeRead(meta);
+    Assert.assertEquals(2, result.size());
+    Assert.assertEquals(1, result.getValues("name-one").length);
+    Assert.assertEquals("value-1.1", result.getValues("name-one")[0]);
+    Assert.assertEquals(2, result.getValues("name-two").length);
+    Assert.assertEquals("value-2.1", result.getValues("name-two")[0]);
+    Assert.assertEquals("value-2.2", result.getValues("name-two")[1]);
+  }
+
+  /**
+   * Round-trips {@code meta} through its Writable serialization (write to a
+   * byte buffer, read back) and returns the deserialized copy; fails the
+   * test on IOException.
+   */
+  private Metadata writeRead(Metadata meta) {
+    Metadata readed = new Metadata();
+    try {
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      meta.write(new DataOutputStream(out));
+      readed.readFields(new DataInputStream(new ByteArrayInputStream(out
+          .toByteArray())));
+    } catch (IOException ioe) {
+      Assert.fail(ioe.toString());
+    }
+    return readed;
+  }
+
+}


[08/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
new file mode 100644
index 0000000..da25d87
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
@@ -0,0 +1,595 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+
+import java.net.InetAddress;
+import java.net.Socket;
+
+import java.util.List;
+//import java.util.LinkedList;
+
+import org.apache.commons.net.MalformedServerReplyException;
+
+import org.apache.commons.net.ftp.FTP;
+import org.apache.commons.net.ftp.FTPCommand;
+import org.apache.commons.net.ftp.FTPFile;
+import org.apache.commons.net.ftp.FTPFileEntryParser;
+import org.apache.commons.net.ftp.FTPReply;
+
+import org.apache.commons.net.ftp.FTPConnectionClosedException;
+
+/***********************************************
+ * Client.java encapsulates functionalities necessary for nutch to get dir list
+ * and retrieve file from an FTP server. This class takes care of all low level
+ * details of interacting with an FTP server and provides a convenient higher
+ * level interface.
+ * 
+ * Modified from FtpClient.java in apache commons-net.
+ * 
+ * Notes by John Xing: ftp server implementations are hardly uniform and none
+ * seems to follow RFCs whole-heartedly. We have no choice, but assume common
+ * denominator as following: (1) Use stream mode for data transfer. Block mode
+ * will be better for multiple file downloading and partial file downloading.
+ * However not every ftpd has block mode support. (2) Use passive mode for data
+ * connection. So Nutch will work if we run behind firewall. (3) Data connection
+ * is opened/closed per ftp command for the reasons listed in (1). There are ftp
+ * servers out there, when partial downloading is enforced by closing data
+ * channel socket on our client side, the server side immediately closes control
+ * channel (socket). Our codes deal with such a bad behavior. (4) LIST is used
+ * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but
+ * not as ubiquitously implemented as LIST. (5) Avoid using ABOR in single
+ * thread? Do not use it at all.
+ * 
+ * About exceptions: Some specific exceptions are re-thrown as one of
+ * FtpException*.java In fact, each function throws FtpException*.java or pass
+ * IOException.
+ * 
+ * @author John Xing
+ ***********************************************/
+
+public class Client extends FTP {
+  private int __dataTimeout;
+  private int __passivePort;
+  private String __passiveHost;
+  // private int __fileType, __fileFormat;
+  private boolean __remoteVerificationEnabled;
+  // private FTPFileEntryParser __entryParser;
+  private String __systemName;
+
+  /** Public default constructor */
+  public Client() {
+    __initDefaults();
+    __dataTimeout = -1;
+    __remoteVerificationEnabled = true;
+  }
+
+  // defaults when initialize
+  private void __initDefaults() {
+    __passiveHost = null;
+    __passivePort = -1;
+    __systemName = null;
+    // __fileType = FTP.ASCII_FILE_TYPE;
+    // __fileFormat = FTP.NON_PRINT_TEXT_FORMAT;
+    // __entryParser = null;
+  }
+
+  // parse reply for pass()
+  private void __parsePassiveModeReply(String reply)
+      throws MalformedServerReplyException {
+    int i, index, lastIndex;
+    String octet1, octet2;
+    StringBuffer host;
+
+    reply = reply.substring(reply.indexOf('(') + 1, reply.indexOf(')')).trim();
+
+    host = new StringBuffer(24);
+    lastIndex = 0;
+    index = reply.indexOf(',');
+    host.append(reply.substring(lastIndex, index));
+
+    for (i = 0; i < 3; i++) {
+      host.append('.');
+      lastIndex = index + 1;
+      index = reply.indexOf(',', lastIndex);
+      host.append(reply.substring(lastIndex, index));
+    }
+
+    lastIndex = index + 1;
+    index = reply.indexOf(',', lastIndex);
+
+    octet1 = reply.substring(lastIndex, index);
+    octet2 = reply.substring(index + 1);
+
+    // index and lastIndex now used as temporaries
+    try {
+      index = Integer.parseInt(octet1);
+      lastIndex = Integer.parseInt(octet2);
+    } catch (NumberFormatException e) {
+      throw new MalformedServerReplyException(
+          "Could not parse passive host information.\nServer Reply: " + reply);
+    }
+
+    index <<= 8;
+    index |= lastIndex;
+
+    __passiveHost = host.toString();
+    __passivePort = index;
+  }
+
+  /**
+   * open a passive data connection socket
+   * 
+   * @param command
+   * @param arg
+   * @return
+   * @throws IOException
+   * @throws FtpExceptionCanNotHaveDataConnection
+   */
+  protected Socket __openPassiveDataConnection(int command, String arg)
+      throws IOException, FtpExceptionCanNotHaveDataConnection {
+    Socket socket;
+
+    // // 20040317, xing, accommodate ill-behaved servers, see below
+    // int port_previous = __passivePort;
+
+    if (pasv() != FTPReply.ENTERING_PASSIVE_MODE)
+      throw new FtpExceptionCanNotHaveDataConnection("pasv() failed. "
+          + getReplyString());
+
+    try {
+      __parsePassiveModeReply(getReplyStrings()[0]);
+    } catch (MalformedServerReplyException e) {
+      throw new FtpExceptionCanNotHaveDataConnection(e.getMessage());
+    }
+
+    // // 20040317, xing, accommodate ill-behaved servers, see above
+    // int count = 0;
+    // System.err.println("__passivePort "+__passivePort);
+    // System.err.println("port_previous "+port_previous);
+    // while (__passivePort == port_previous) {
+    // // just quit if too many tries. make it an exception here?
+    // if (count++ > 10)
+    // return null;
+    // // slow down further for each new try
+    // Thread.sleep(500*count);
+    // if (pasv() != FTPReply.ENTERING_PASSIVE_MODE)
+    // throw new FtpExceptionCanNotHaveDataConnection(
+    // "pasv() failed. " + getReplyString());
+    // //return null;
+    // try {
+    // __parsePassiveModeReply(getReplyStrings()[0]);
+    // } catch (MalformedServerReplyException e) {
+    // throw new FtpExceptionCanNotHaveDataConnection(e.getMessage());
+    // }
+    // }
+
+    socket = _socketFactory_.createSocket(__passiveHost, __passivePort);
+
+    if (!FTPReply.isPositivePreliminary(sendCommand(command, arg))) {
+      socket.close();
+      return null;
+    }
+
+    if (__remoteVerificationEnabled && !verifyRemote(socket)) {
+      InetAddress host1, host2;
+
+      host1 = socket.getInetAddress();
+      host2 = getRemoteAddress();
+
+      socket.close();
+
+      // our precaution
+      throw new FtpExceptionCanNotHaveDataConnection(
+          "Host attempting data connection " + host1.getHostAddress()
+              + " is not same as server " + host2.getHostAddress()
+              + " So we intentionally close it for security precaution.");
+    }
+
+    if (__dataTimeout >= 0)
+      socket.setSoTimeout(__dataTimeout);
+
+    return socket;
+  }
+
+  /***
+   * Sets the timeout in milliseconds to use for data connection. set
+   * immediately after opening the data connection.
+   ***/
+  public void setDataTimeout(int timeout) {
+    __dataTimeout = timeout;
+  }
+
+  /***
+   * Closes the connection to the FTP server and restores connection parameters
+   * to the default values.
+   * <p>
+   * 
+   * @exception IOException
+   *              If an error occurs while disconnecting.
+   ***/
+  public void disconnect() throws IOException {
+    __initDefaults();
+    super.disconnect();
+    // no worry for data connection, since we always close it
+    // in every ftp command that invloves data connection
+  }
+
+  /***
+   * Enable or disable verification that the remote host taking part of a data
+   * connection is the same as the host to which the control connection is
+   * attached. The default is for verification to be enabled. You may set this
+   * value at any time, whether the FTPClient is currently connected or not.
+   * <p>
+   * 
+   * @param enable
+   *          True to enable verification, false to disable verification.
+   ***/
+  public void setRemoteVerificationEnabled(boolean enable) {
+    __remoteVerificationEnabled = enable;
+  }
+
+  /***
+   * Return whether or not verification of the remote host participating in data
+   * connections is enabled. The default behavior is for verification to be
+   * enabled.
+   * <p>
+   * 
+   * @return True if verification is enabled, false if not.
+   ***/
+  public boolean isRemoteVerificationEnabled() {
+    return __remoteVerificationEnabled;
+  }
+
+  /***
+   * Login to the FTP server using the provided username and password.
+   * <p>
+   * 
+   * @param username
+   *          The username to login under.
+   * @param password
+   *          The password to use.
+   * @return True if successfully completed, false if not.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   ***/
+  public boolean login(String username, String password) throws IOException {
+    user(username);
+
+    if (FTPReply.isPositiveCompletion(getReplyCode()))
+      return true;
+
+    // If we get here, we either have an error code, or an intermmediate
+    // reply requesting password.
+    if (!FTPReply.isPositiveIntermediate(getReplyCode()))
+      return false;
+
+    return FTPReply.isPositiveCompletion(pass(password));
+  }
+
+  /***
+   * Logout of the FTP server by sending the QUIT command.
+   * <p>
+   * 
+   * @return True if successfully completed, false if not.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   ***/
+  public boolean logout() throws IOException {
+    return FTPReply.isPositiveCompletion(quit());
+  }
+
+  /**
+   * retrieve list reply for path
+   * 
+   * @param path
+   * @param entries
+   * @param limit
+   * @param parser
+   * @throws IOException
+   * @throws FtpExceptionCanNotHaveDataConnection
+   * @throws FtpExceptionUnknownForcedDataClose
+   * @throws FtpExceptionControlClosedByForcedDataClose
+   */
+  public void retrieveList(String path, List<FTPFile> entries, int limit,
+      FTPFileEntryParser parser) throws IOException,
+      FtpExceptionCanNotHaveDataConnection, FtpExceptionUnknownForcedDataClose,
+      FtpExceptionControlClosedByForcedDataClose {
+    Socket socket = __openPassiveDataConnection(FTPCommand.LIST, path);
+
+    if (socket == null)
+      throw new FtpExceptionCanNotHaveDataConnection("LIST "
+          + ((path == null) ? "" : path));
+
+    BufferedReader reader = new BufferedReader(new InputStreamReader(
+        socket.getInputStream()));
+
+    // force-close data channel socket, when download limit is reached
+    // boolean mandatory_close = false;
+
+    // List entries = new LinkedList();
+    int count = 0;
+    String line = parser.readNextEntry(reader);
+    while (line != null) {
+      FTPFile ftpFile = parser.parseFTPEntry(line);
+      // skip non-formatted lines
+      if (ftpFile == null) {
+        line = parser.readNextEntry(reader);
+        continue;
+      }
+      entries.add(ftpFile);
+      count += line.length();
+      // impose download limit if limit >= 0, otherwise no limit
+      // here, cut off is up to the line when total bytes is just over limit
+      if (limit >= 0 && count > limit) {
+        // mandatory_close = true;
+        break;
+      }
+      line = parser.readNextEntry(reader);
+    }
+
+    // if (mandatory_close)
+    // you always close here, no matter mandatory_close or not.
+    // however different ftp servers respond differently, see below.
+    socket.close();
+
+    // scenarios:
+    // (1) mandatory_close is false, download limit not reached
+    // no special care here
+    // (2) mandatory_close is true, download limit is reached
+    // different servers have different reply codes:
+
+    try {
+      int reply = getReply();
+      if (!_notBadReply(reply))
+        throw new FtpExceptionUnknownForcedDataClose(getReplyString());
+    } catch (FTPConnectionClosedException e) {
+      // some ftp servers will close control channel if data channel socket
+      // is closed by our end before all data has been read out. Check:
+      // tux414.q-tam.hp.com FTP server (hp.com version whp02)
+      // so must catch FTPConnectionClosedException thrown by getReply() above
+      // disconnect();
+      throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage());
+    }
+
+  }
+
+  /**
+   * retrieve file for path
+   * 
+   * @param path
+   * @param os
+   * @param limit
+   * @throws IOException
+   * @throws FtpExceptionCanNotHaveDataConnection
+   * @throws FtpExceptionUnknownForcedDataClose
+   * @throws FtpExceptionControlClosedByForcedDataClose
+   */
+  public void retrieveFile(String path, OutputStream os, int limit)
+      throws IOException, FtpExceptionCanNotHaveDataConnection,
+      FtpExceptionUnknownForcedDataClose,
+      FtpExceptionControlClosedByForcedDataClose {
+
+    Socket socket = __openPassiveDataConnection(FTPCommand.RETR, path);
+
+    if (socket == null)
+      throw new FtpExceptionCanNotHaveDataConnection("RETR "
+          + ((path == null) ? "" : path));
+
+    InputStream input = socket.getInputStream();
+
+    // 20040318, xing, treat everything as BINARY_FILE_TYPE for now
+    // do we ever need ASCII_FILE_TYPE?
+    // if (__fileType == ASCII_FILE_TYPE)
+    // input = new FromNetASCIIInputStream(input);
+
+    // fixme, should we instruct server here for binary file type?
+
+    // force-close data channel socket
+    // boolean mandatory_close = false;
+
+    int len;
+    int count = 0;
+    byte[] buf = new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE];
+    while ((len = input.read(buf, 0, buf.length)) != -1) {
+      count += len;
+      // impose download limit if limit >= 0, otherwise no limit
+      // here, cut off is exactly of limit bytes
+      if (limit >= 0 && count > limit) {
+        os.write(buf, 0, len - (count - limit));
+        // mandatory_close = true;
+        break;
+      }
+      os.write(buf, 0, len);
+      os.flush();
+    }
+
+    // if (mandatory_close)
+    // you always close here, no matter mandatory_close or not.
+    // however different ftp servers respond differently, see below.
+    socket.close();
+
+    // scenarios:
+    // (1) mandatory_close is false, download limit not reached
+    // no special care here
+    // (2) mandatory_close is true, download limit is reached
+    // different servers have different reply codes:
+
+    // do not need this
+    // sendCommand("ABOR");
+
+    try {
+      int reply = getReply();
+      if (!_notBadReply(reply))
+        throw new FtpExceptionUnknownForcedDataClose(getReplyString());
+    } catch (FTPConnectionClosedException e) {
+      // some ftp servers will close control channel if data channel socket
+      // is closed by our end before all data has been read out. Check:
+      // tux414.q-tam.hp.com FTP server (hp.com version whp02)
+      // so must catch FTPConnectionClosedException thrown by getReply() above
+      // disconnect();
+      throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage());
+    }
+
+  }
+
+  /**
+   * reply check after closing data connection
+   * 
+   * @param reply
+   * @return
+   */
+  private boolean _notBadReply(int reply) {
+
+    if (FTPReply.isPositiveCompletion(reply)) {
+      // do nothing
+    } else if (reply == 426) { // FTPReply.TRANSFER_ABORTED
+      // some ftp servers reply 426, e.g.,
+      // foggy FTP server (Version wu-2.6.2(2)
+      // there is second reply witing? no!
+      // getReply();
+    } else if (reply == 450) { // FTPReply.FILE_ACTION_NOT_TAKEN
+      // some ftp servers reply 450, e.g.,
+      // ProFTPD [ftp.kernel.org]
+      // there is second reply witing? no!
+      // getReply();
+    } else if (reply == 451) { // FTPReply.ACTION_ABORTED
+      // some ftp servers reply 451, e.g.,
+      // ProFTPD [ftp.kernel.org]
+      // there is second reply witing? no!
+      // getReply();
+    } else if (reply == 451) { // FTPReply.ACTION_ABORTED
+    } else {
+      // what other kind of ftp server out there?
+      return false;
+    }
+
+    return true;
+  }
+
+  /***
+   * Sets the file type to be transferred. This should be one of
+   * <code> FTP.ASCII_FILE_TYPE </code>, <code> FTP.IMAGE_FILE_TYPE </code>,
+   * etc. The file type only needs to be set when you want to change the type.
+   * After changing it, the new type stays in effect until you change it again.
+   * The default file type is <code> FTP.ASCII_FILE_TYPE </code> if this method
+   * is never called.
+   * <p>
+   * 
+   * @param fileType
+   *          The <code> _FILE_TYPE </code> constant indcating the type of file.
+   * @return True if successfully completed, false if not.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   ***/
+  public boolean setFileType(int fileType) throws IOException {
+    if (FTPReply.isPositiveCompletion(type(fileType))) {
+      /*
+       * __fileType = fileType; __fileFormat = FTP.NON_PRINT_TEXT_FORMAT;
+       */
+      return true;
+    }
+    return false;
+  }
+
+  /***
+   * Fetches the system type name from the server and returns the string. This
+   * value is cached for the duration of the connection after the first call to
+   * this method. In other words, only the first time that you invoke this
+   * method will it issue a SYST command to the FTP server. FTPClient will
+   * remember the value and return the cached value until a call to disconnect.
+   * <p>
+   * 
+   * @return The system type name obtained from the server. null if the
+   *         information could not be obtained.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   ***/
+  public String getSystemName() throws IOException, FtpExceptionBadSystResponse {
+    // if (syst() == FTPReply.NAME_SYSTEM_TYPE)
+    // Technically, we should expect a NAME_SYSTEM_TYPE response, but
+    // in practice FTP servers deviate, so we soften the condition to
+    // a positive completion.
+    if (__systemName == null && FTPReply.isPositiveCompletion(syst())) {
+      __systemName = (getReplyStrings()[0]).substring(4);
+    } else {
+      throw new FtpExceptionBadSystResponse("Bad response of SYST: "
+          + getReplyString());
+    }
+
+    return __systemName;
+  }
+
+  /***
+   * Sends a NOOP command to the FTP server. This is useful for preventing
+   * server timeouts.
+   * <p>
+   * 
+   * @return True if successfully completed, false if not.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   ***/
+  public boolean sendNoOp() throws IOException {
+    return FTPReply.isPositiveCompletion(noop());
+  }
+
+  // client.stat(path);
+  // client.sendCommand("STAT");
+  // client.sendCommand("STAT",path);
+  // client.sendCommand("MDTM",path);
+  // client.sendCommand("SIZE",path);
+  // client.sendCommand("HELP","SITE");
+  // client.sendCommand("SYST");
+  // client.setRestartOffset(120);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
new file mode 100644
index 0000000..772f3bb
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -0,0 +1,267 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.net.ftp.FTPFileEntryParser;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import crawlercommons.robots.BaseRobotRules;
+
+import java.net.URL;
+
+import java.io.IOException;
+
+/**
+ * This class is a protocol plugin used for ftp: scheme. It creates
+ * {@link FtpResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},
+ * {@code ftp.content.limit}, {@code ftp.timeout}, {@code ftp.server.timeout},
+ * {@code ftp.password}, {@code ftp.keep.connection} and {@code ftp.follow.talk}
+ * . For details see "FTP properties" section in {@code nutch-default.xml}.
+ */
+public class Ftp implements Protocol {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
+
+  private static final int BUFFER_SIZE = 16384; // 16*1024 = 16384
+
+  static final int MAX_REDIRECTS = 5;
+
+  int timeout;
+
+  int maxContentLength;
+
+  String userName;
+  String passWord;
+
+  // typical/default server timeout is 120*1000 millisec.
+  // better be conservative here
+  int serverTimeout;
+
+  // when to have client start anew
+  long renewalTime = -1;
+
+  boolean keepConnection;
+
+  boolean followTalk;
+
+  // ftp client
+  Client client = null;
+  // ftp dir list entry parser
+  FTPFileEntryParser parser = null;
+
+  private Configuration conf;
+
+  private FtpRobotRulesParser robots = null;
+
+  // constructor
+  public Ftp() {
+    robots = new FtpRobotRulesParser();
+  }
+
+  /** Set the timeout. */
+  public void setTimeout(int to) {
+    timeout = to;
+  }
+
+  /** Set the point at which content is truncated. */
+  public void setMaxContentLength(int length) {
+    maxContentLength = length;
+  }
+
+  /** Set followTalk */
+  public void setFollowTalk(boolean followTalk) {
+    this.followTalk = followTalk;
+  }
+
+  /** Set keepConnection */
+  public void setKeepConnection(boolean keepConnection) {
+    this.keepConnection = keepConnection;
+  }
+
+  /**
+   * Creates a {@link FtpResponse} object corresponding to the url and returns a
+   * {@link ProtocolOutput} object as per the content received
+   * 
+   * @param url
+   *          Text containing the ftp url
+   * @param datum
+   *          The CrawlDatum object corresponding to the url
+   * 
+   * @return {@link ProtocolOutput} object for the url
+   */
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+    String urlString = url.toString();
+    try {
+      URL u = new URL(urlString);
+
+      int redirects = 0;
+
+      while (true) {
+        FtpResponse response;
+        response = new FtpResponse(u, datum, this, getConf()); // make a request
+
+        int code = response.getCode();
+        datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+          new Text(Integer.toString(code)));
+        
+
+        if (code == 200) { // got a good response
+          return new ProtocolOutput(response.toContent()); // return it
+
+        } else if (code >= 300 && code < 400) { // handle redirect
+          if (redirects == MAX_REDIRECTS)
+            throw new FtpException("Too many redirects: " + url);
+          u = new URL(response.getHeader("Location"));
+          redirects++;
+          if (LOG.isTraceEnabled()) {
+            LOG.trace("redirect to " + u);
+          }
+        } else { // convert to exception
+          throw new FtpError(code);
+        }
+      }
+    } catch (Exception e) {
+      return new ProtocolOutput(null, new ProtocolStatus(e));
+    }
+  }
+
+  protected void finalize() {
+    try {
+      if (this.client != null && this.client.isConnected()) {
+        this.client.logout();
+        this.client.disconnect();
+      }
+    } catch (IOException e) {
+      // do nothing
+    }
+  }
+
+  /** For debugging. */
+  public static void main(String[] args) throws Exception {
+    int timeout = Integer.MIN_VALUE;
+    int maxContentLength = Integer.MIN_VALUE;
+    String logLevel = "info";
+    boolean followTalk = false;
+    boolean keepConnection = false;
+    boolean dumpContent = false;
+    String urlString = null;
+
+    String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-logLevel")) {
+        logLevel = args[++i];
+      } else if (args[i].equals("-followTalk")) {
+        followTalk = true;
+      } else if (args[i].equals("-keepConnection")) {
+        keepConnection = true;
+      } else if (args[i].equals("-timeout")) {
+        timeout = Integer.parseInt(args[++i]) * 1000;
+      } else if (args[i].equals("-maxContentLength")) {
+        maxContentLength = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-dumpContent")) {
+        dumpContent = true;
+      } else if (i != args.length - 1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else {
+        urlString = args[i];
+      }
+    }
+
+    Ftp ftp = new Ftp();
+
+    ftp.setFollowTalk(followTalk);
+    ftp.setKeepConnection(keepConnection);
+
+    if (timeout != Integer.MIN_VALUE) // set timeout
+      ftp.setTimeout(timeout);
+
+    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
+      ftp.setMaxContentLength(maxContentLength);
+
+    // set log level
+    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
+
+    Content content = ftp.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+
+    System.err.println("Content-Type: " + content.getContentType());
+    System.err.println("Content-Length: "
+        + content.getMetadata().get(Response.CONTENT_LENGTH));
+    System.err.println("Last-Modified: "
+        + content.getMetadata().get(Response.LAST_MODIFIED));
+    if (dumpContent) {
+      System.out.print(new String(content.getContent()));
+    }
+
+    ftp = null;
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
+    this.timeout = conf.getInt("ftp.timeout", 10000);
+    this.userName = conf.get("ftp.username", "anonymous");
+    this.passWord = conf.get("ftp.password", "anonymous@example.com");
+    this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
+    this.keepConnection = conf.getBoolean("ftp.keep.connection", false);
+    this.followTalk = conf.getBoolean("ftp.follow.talk", false);
+    this.robots.setConf(conf);
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Get the robots rules for a given url
+   */
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+    return robots.getRobotRulesSet(this, url);
+  }
+
+  public int getBufferSize() {
+    return BUFFER_SIZE;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
new file mode 100644
index 0000000..b63a67e
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Thrown for Ftp error codes.
+ */
+public class FtpError extends FtpException {
+
+  private int code;
+
+  public int getCode(int code) {
+    return code;
+  }
+
+  public FtpError(int code) {
+    super("Ftp Error: " + code);
+    this.code = code;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
new file mode 100644
index 0000000..5a29668
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.apache.nutch.protocol.ProtocolException;
+
+/***
+ * Superclass for important exceptions thrown during FTP talk, that must be
+ * handled with care.
+ * 
+ * @author John Xing
+ */
+public class FtpException extends ProtocolException {
+
+  public FtpException() {
+    super();
+  }
+
+  public FtpException(String message) {
+    super(message);
+  }
+
+  public FtpException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  public FtpException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
new file mode 100644
index 0000000..689ac8e
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating bad reply of SYST command.
+ * 
+ * @author John Xing
+ */
+public class FtpExceptionBadSystResponse extends FtpException {
+  FtpExceptionBadSystResponse(String msg) {
+    super(msg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
new file mode 100644
index 0000000..9f35b74
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating failure of opening data connection.
+ * 
+ * @author John Xing
+ */
+public class FtpExceptionCanNotHaveDataConnection extends FtpException {
+  FtpExceptionCanNotHaveDataConnection(String msg) {
+    super(msg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
new file mode 100644
index 0000000..c058fcb
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating control channel is closed by server end, due to forced
+ * closure of data channel at client (our) end.
+ * 
+ * @author John Xing
+ */
+public class FtpExceptionControlClosedByForcedDataClose extends FtpException {
+  FtpExceptionControlClosedByForcedDataClose(String msg) {
+    super(msg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
new file mode 100644
index 0000000..9083d7c
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating unrecognizable reply from server after forced closure of
+ * data channel by client (our) side.
+ * 
+ * @author John Xing
+ */
+public class FtpExceptionUnknownForcedDataClose extends FtpException {
+  FtpExceptionUnknownForcedDataClose(String msg) {
+    super(msg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
new file mode 100644
index 0000000..f7c7c6d
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -0,0 +1,521 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.apache.commons.net.ftp.FTP;
+import org.apache.commons.net.ftp.FTPFile;
+import org.apache.commons.net.ftp.FTPReply;
+import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
+import org.apache.commons.net.ftp.parser.ParserInitializationException;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.hadoop.conf.Configuration;
+
+import java.net.InetAddress;
+import java.net.URL;
+import java.util.List;
+import java.util.LinkedList;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+/**
+ * FtpResponse.java mimics ftp replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ * 
+ * Comments: In this class, all FtpException*.java thrown by Client.java and
+ * some important commons-net exceptions passed by Client.java must have been
+ * properly dealt with. They'd better not be leaked to the caller of this class.
+ */
+public class FtpResponse {
+
  private String orig; // original url string (constructor sets it to url.toString())
  private String base; // base url string (identical to orig here)
  private byte[] content; // fetched bytes; stays null until a fetch succeeds
  private static final byte[] EMPTY_CONTENT = new byte[0];
  private int code; // http-style response code mimicking the ftp outcome
  private Metadata headers = new Metadata(); // http-style response headers

  private final Ftp ftp; // owning protocol plugin (supplies client + settings)
  private Configuration conf;

  /** Returns the response code. */
  public int getCode() {
    return code;
  }

  /** Returns the value of a named header. */
  public String getHeader(String name) {
    return headers.get(name);
  }

  /** Returns the raw fetched bytes, or {@code null} if nothing was fetched. */
  public byte[] getContent() {
    return content;
  }

  /**
   * Wraps this response in a {@link Content} record; a {@code null} payload
   * is replaced by an empty byte array.
   */
  public Content toContent() {
    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
        getHeader(Response.CONTENT_TYPE), headers, this.conf);
  }
+
  /**
   * Fetches {@code url} over FTP and records the outcome as an http-like
   * response (code, headers, content). Manages the shared {@code ftp.client}:
   * reuses it when {@code ftp.keepConnection} is set, recreates it after the
   * renewal time passes, and nulls it out on unexpected failures.
   *
   * @param url ftp url to fetch; must use the "ftp" protocol
   * @param datum crawl state; its modified time enables 304 short-circuiting
   * @param ftp owning plugin instance supplying client, credentials and timeouts
   * @param conf Hadoop configuration
   * @throws FtpException if the url is not an ftp url, or wrapping any
   *           exception raised during the FTP conversation
   * @throws IOException declared for callers; in practice exceptions inside
   *           the conversation are wrapped into FtpException
   */
  public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
      throws FtpException, IOException {

    this.orig = url.toString();
    this.base = url.toString();
    this.ftp = ftp;
    this.conf = conf;

    if (!"ftp".equals(url.getProtocol()))
      throw new FtpException("Not a ftp url:" + url);

    // NOTE(review): '!=' is a reference comparison, not equals(); presumably
    // this flags urls where getFile() includes a query part — confirm intent.
    if (url.getPath() != url.getFile()) {
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn("url.getPath() != url.getFile(): " + url);
      }
    }

    String path = "".equals(url.getPath()) ? "/" : url.getPath();

    try {

      if (ftp.followTalk) {
        if (Ftp.LOG.isInfoEnabled()) {
          Ftp.LOG.info("fetching " + url);
        }
      } else {
        if (Ftp.LOG.isTraceEnabled()) {
          Ftp.LOG.trace("fetching " + url);
        }
      }

      InetAddress addr = InetAddress.getByName(url.getHost());
      if (addr != null && conf.getBoolean("store.ip.address", false) == true) {
        headers.add("_ip_", addr.getHostAddress());
      }

      // idled too long, remote server or ourselves may have timed out,
      // should start anew.
      if (ftp.client != null && ftp.keepConnection
          && ftp.renewalTime < System.currentTimeMillis()) {
        if (Ftp.LOG.isInfoEnabled()) {
          Ftp.LOG.info("delete client because idled too long");
        }
        ftp.client = null;
      }

      // start anew if needed
      if (ftp.client == null) {
        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("start client");
        }
        // the real client
        ftp.client = new Client();
        // when to renew, take the lesser
        // ftp.renewalTime = System.currentTimeMillis()
        // + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout :
        // ftp.serverTimeout);

        // timeout for control connection
        ftp.client.setDefaultTimeout(ftp.timeout);
        // timeout for data connection
        ftp.client.setDataTimeout(ftp.timeout);

        // follow ftp talk?
        if (ftp.followTalk)
          ftp.client.addProtocolCommandListener(new PrintCommandListener(
              Ftp.LOG));
      }

      // quit from previous site if at a different site now
      if (ftp.client.isConnected()) {
        InetAddress remoteAddress = ftp.client.getRemoteAddress();
        if (!addr.equals(remoteAddress)) {
          if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
            Ftp.LOG.info("disconnect from " + remoteAddress
                + " before connect to " + addr);
          }
          // quit from current site
          ftp.client.logout();
          ftp.client.disconnect();
        }
      }

      // connect to current site if needed
      if (!ftp.client.isConnected()) {

        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("connect to " + addr);
        }

        ftp.client.connect(addr);
        if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
          ftp.client.disconnect();
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " "
                + ftp.client.getReplyString());
          }
          this.code = 500; // http Internal Server Error
          return;
        }

        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("log into " + addr);
        }

        if (!ftp.client.login(ftp.userName, ftp.passWord)) {
          // login failed.
          // please note that some server may return 421 immediately
          // after USER anonymous, thus ftp.client.login() won't return false,
          // but throw exception, which then will be handled by caller
          // (not dealt with here at all) .
          ftp.client.disconnect();
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("ftp.client.login() failed: " + addr);
          }
          this.code = 401; // http Unauthorized
          return;
        }

        // insist on binary file type
        if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) {
          ftp.client.logout();
          ftp.client.disconnect();
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr);
          }
          this.code = 500; // http Internal Server Error
          return;
        }

        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("set parser for " + addr);
        }

        // SYST is valid only after login
        try {
          ftp.parser = null;
          String parserKey = ftp.client.getSystemName();
          // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
          if (parserKey.startsWith("UNKNOWN Type: L8"))
            parserKey = "UNIX Type: L8";
          ftp.parser = (new DefaultFTPFileEntryParserFactory())
              .createFileEntryParser(parserKey);
        } catch (FtpExceptionBadSystResponse e) {
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG
                .warn("ftp.client.getSystemName() failed: " + addr + " " + e);
          }
          ftp.parser = null;
        } catch (ParserInitializationException e) {
          // ParserInitializationException is RuntimeException defined in
          // org.apache.commons.net.ftp.parser.ParserInitializationException
          if (Ftp.LOG.isWarnEnabled()) {
            Ftp.LOG.warn("createFileEntryParser() failed. " + addr + " " + e);
          }
          ftp.parser = null;
        } finally {
          // without a parser we cannot interpret listings: give up cleanly
          if (ftp.parser == null) {
            // do not log as severe, otherwise
            // FetcherThread/RequestScheduler will abort
            if (Ftp.LOG.isWarnEnabled()) {
              Ftp.LOG.warn("ftp.parser is null: " + addr);
            }
            ftp.client.logout();
            ftp.client.disconnect();
            this.code = 500; // http Internal Server Error
            return;
          }
        }

      } else {
        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("use existing connection");
        }
      }

      this.content = null;

      // trailing slash selects directory-listing mode, otherwise file mode
      if (path.endsWith("/")) {
        getDirAsHttpResponse(path, datum.getModifiedTime());
      } else {
        getFileAsHttpResponse(path, datum.getModifiedTime());
      }

      // reset next renewalTime, take the lesser
      if (ftp.client != null && ftp.keepConnection) {
        ftp.renewalTime = System.currentTimeMillis()
            + ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout
                : ftp.serverTimeout);
        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("reset renewalTime to "
              + HttpDateFormat.toString(ftp.renewalTime));
        }
      }

      // getDirAsHttpResponse() or getFileAsHttpResponse() above
      // may have deleted ftp.client
      if (ftp.client != null && !ftp.keepConnection) {
        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
          Ftp.LOG.info("disconnect from " + addr);
        }
        ftp.client.logout();
        ftp.client.disconnect();
      }

    } catch (Exception e) {
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn("Error: ", e);
      }
      // for any un-foreseen exception (run time exception or not),
      // do ultimate clean and leave ftp.client for garbage collection
      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
        Ftp.LOG.info("delete client due to exception");
      }
      ftp.client = null;
      // or do explicit garbage collection?
      // System.gc();
      // can we be less dramatic, using the following instead?
      // probably unnecessary for our practical purpose here
      // try {
      // ftp.client.logout();
      // ftp.client.disconnect();
      // }
      throw new FtpException(e);
      // throw e;
    }

  }
+
  /**
   * Retrieves a single ftp file and maps the outcome onto http-style
   * response codes (200, 304, 300 redirect-to-dir, 400, 404) and headers.
   *
   * @param path absolute remote path of the file (no trailing slash)
   * @param lastModified previously seen modification time; if the remote
   *          timestamp is not newer, the response becomes a 304
   * @throws IOException on network errors not translated by Client
   */
  private void getFileAsHttpResponse(String path, long lastModified)
      throws IOException {

    ByteArrayOutputStream os = null;
    List<FTPFile> list = null;

    try {
      // first get its possible attributes
      list = new LinkedList<FTPFile>();
      ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);

      FTPFile ftpFile = (FTPFile) list.get(0);
      this.headers.set(Response.CONTENT_LENGTH,
          new Long(ftpFile.getSize()).toString());
      this.headers.set(Response.LAST_MODIFIED,
          HttpDateFormat.toString(ftpFile.getTimestamp()));
      // don't retrieve the file if not changed.
      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
        code = 304;
        return;
      }
      os = new ByteArrayOutputStream(ftp.getBufferSize());
      ftp.client.retrieveFile(path, os, ftp.maxContentLength);

      this.content = os.toByteArray();

      // // approximate bytes sent and read
      // if (this.httpAccounting != null) {
      // this.httpAccounting.incrementBytesSent(path.length());
      // this.httpAccounting.incrementBytesRead(this.content.length);
      // }

      this.code = 200; // http OK

    } catch (FtpExceptionControlClosedByForcedDataClose e) {

      // control connection is off, clean up
      // ftp.client.disconnect();
      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
        Ftp.LOG.info("delete client because server cut off control channel: "
            + e);
      }
      ftp.client = null;

      // in case this FtpExceptionControlClosedByForcedDataClose is
      // thrown by retrieveList() (not retrieveFile()) above,
      if (os == null) { // indicating throwing by retrieveList()
        // throw new FtpException("fail to get attributes: "+path);
        if (Ftp.LOG.isWarnEnabled()) {
          Ftp.LOG
              .warn("Please try larger maxContentLength for ftp.client.retrieveList(). "
                  + e);
        }
        // in a way, this is our request fault
        this.code = 400; // http Bad request
        return;
      }

      // data was (partially) retrieved before the forced close: still serve it
      FTPFile ftpFile = (FTPFile) list.get(0);
      this.headers.set(Response.CONTENT_LENGTH,
          new Long(ftpFile.getSize()).toString());
      // this.headers.put("content-type", "text/html");
      this.headers.set(Response.LAST_MODIFIED,
          HttpDateFormat.toString(ftpFile.getTimestamp()));
      this.content = os.toByteArray();
      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
        code = 304;
        return;
      }

      // // approximate bytes sent and read
      // if (this.httpAccounting != null) {
      // this.httpAccounting.incrementBytesSent(path.length());
      // this.httpAccounting.incrementBytesRead(this.content.length);
      // }

      this.code = 200; // http OK

    } catch (FtpExceptionCanNotHaveDataConnection e) {

      if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
        // it is not a file, but dir, so redirect as a dir
        this.headers.set(Response.LOCATION, path + "/");
        this.code = 300; // http redirect
        // fixme, should we do ftp.client.cwd("/"), back to top dir?
      } else {
        // it is not a dir either
        this.code = 404; // http Not Found
      }

    } catch (FtpExceptionUnknownForcedDataClose e) {
      // Please note control channel is still live.
      // in a way, this is our request fault
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
            + "If this is acceptable, please modify Client.java accordingly. "
            + e);
      }
      this.code = 400; // http Bad Request
    }

  }
+
  /**
   * Retrieves an ftp directory listing, renders it as an html index page via
   * {@code list2html}, and maps the outcome onto http-style response codes
   * (200, 400, 404, 500).
   *
   * @param path absolute remote directory path (trailing slash)
   * @param lastModified previously seen modification time (not used for
   *          directories by this implementation)
   * @throws IOException on network errors not translated by Client
   */
  private void getDirAsHttpResponse(String path, long lastModified)
      throws IOException {
    List<FTPFile> list = new LinkedList<FTPFile>();

    try {

      // change to that dir first
      if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
        this.code = 404; // http Not Found
        return;
      }

      // fixme, should we do ftp.client.cwd("/"), back to top dir?

      ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser);
      this.content = list2html(list, path, "/".equals(path) ? false : true);
      this.headers.set(Response.CONTENT_LENGTH,
          new Integer(this.content.length).toString());
      this.headers.set(Response.CONTENT_TYPE, "text/html");
      // this.headers.put("Last-Modified", null);

      // // approximate bytes sent and read
      // if (this.httpAccounting != null) {
      // this.httpAccounting.incrementBytesSent(path.length());
      // this.httpAccounting.incrementBytesRead(this.content.length);
      // }

      this.code = 200; // http OK

    } catch (FtpExceptionControlClosedByForcedDataClose e) {

      // control connection is off, clean up
      // ftp.client.disconnect();
      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
        Ftp.LOG.info("delete client because server cut off control channel: "
            + e);
      }
      ftp.client = null;

      // entries collected before the forced close are still served as a 200
      this.content = list2html(list, path, "/".equals(path) ? false : true);
      this.headers.set(Response.CONTENT_LENGTH,
          new Integer(this.content.length).toString());
      this.headers.set(Response.CONTENT_TYPE, "text/html");
      // this.headers.put("Last-Modified", null);

      // // approximate bytes sent and read
      // if (this.httpAccounting != null) {
      // this.httpAccounting.incrementBytesSent(path.length());
      // this.httpAccounting.incrementBytesRead(this.content.length);
      // }

      this.code = 200; // http OK

    } catch (FtpExceptionUnknownForcedDataClose e) {
      // Please note control channel is still live.
      // in a way, this is our request fault
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
            + "If this is acceptable, please modify Client.java accordingly. "
            + e);
      }
      this.code = 400; // http Bad Request
    } catch (FtpExceptionCanNotHaveDataConnection e) {
      if (Ftp.LOG.isWarnEnabled()) {
        Ftp.LOG.warn("" + e);
      }
      this.code = 500; // http Internal Server Error
    }

  }
+
+  // generate html page from ftp dir list
+  private byte[] list2html(List<FTPFile> list, String path,
+      boolean includeDotDot) {
+
+    // StringBuffer x = new
+    // StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>");
+    StringBuffer x = new StringBuffer("<html><head>");
+    x.append("<title>Index of " + path + "</title></head>\n");
+    x.append("<body><h1>Index of " + path + "</h1><pre>\n");
+
+    if (includeDotDot) {
+      x.append("<a href='../'>../</a>\t-\t-\t-\n");
+    }
+
+    for (int i = 0; i < list.size(); i++) {
+      FTPFile f = (FTPFile) list.get(i);
+      String name = f.getName();
+      String time = HttpDateFormat.toString(f.getTimestamp());
+      if (f.isDirectory()) {
+        // some ftp server LIST "." and "..", we skip them here
+        if (name.equals(".") || name.equals(".."))
+          continue;
+        x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
+        x.append(time + "\t-\n");
+      } else if (f.isFile()) {
+        x.append("<a href='" + name + "'>" + name + "</a>\t");
+        x.append(time + "\t" + f.getSize() + "\n");
+      } else {
+        // ignore isSymbolicLink()
+        // ignore isUnknown()
+      }
+    }
+
+    x.append("</pre></body></html>\n");
+
+    return new String(x).getBytes();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
new file mode 100644
index 0000000..3764864
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
/**
 * This class is used for parsing robots rules for URLs belonging to the FTP
 * protocol. It extends the generic {@link RobotRulesParser} class and contains
 * the FTP-specific implementation for obtaining the robots file.
 */
public class FtpRobotRulesParser extends RobotRulesParser {

  // robots.txt fetched over FTP is always parsed as plain text.
  private static final String CONTENT_TYPE = "text/plain";
  public static final Logger LOG = LoggerFactory
      .getLogger(FtpRobotRulesParser.class);

  FtpRobotRulesParser() {
  }

  public FtpRobotRulesParser(Configuration conf) {
    super(conf);
  }

  /**
   * Returns the robots rules for the host of the given URL. Rules are served
   * from the shared {@code CACHE} (keyed by "protocol:host") when available;
   * otherwise a single FTP request is made for {@code /robots.txt}, the
   * response is parsed, and the resulting rules object is cached to avoid
   * re-work in the future. Whitelisted hosts get {@code EMPTY_RULES} without
   * any fetch.
   * 
   * @param ftp
   *          The {@link Protocol} object; expected to be an {@link Ftp}
   *          instance (it is cast below to issue the robots.txt request)
   * @param url
   *          URL
   * 
   * @return robotRules A {@link BaseRobotRules} object for the rules
   */
  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {

    String protocol = url.getProtocol().toLowerCase(); // normalize to lower
                                                       // case
    String host = url.getHost().toLowerCase(); // normalize to lower case

    // Trace-only message; emitted even when the rules are already cached.
    // NOTE(review): isWhiteListed(url) is evaluated here and again below —
    // harmless but redundant.
    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
    }

    // Cache key combines protocol and host, so ftp://host and sftp://host
    // (for example) are cached independently.
    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);

    if (robotRules != null) {
      return robotRules; // cached rule
    } else if (LOG.isTraceEnabled()) {
      LOG.trace("cache miss " + url);
    }

    boolean cacheRule = true;

    if (isWhiteListed(url)) {
      // check in advance whether a host is whitelisted
      // (we do not need to fetch robots.txt)
      robotRules = EMPTY_RULES;
      LOG.info("Whitelisted host found for: {}", url);
      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);

    } else {
      try {
        // Fetch /robots.txt from the site root via the FTP protocol plugin.
        Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
        ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
            new CrawlDatum());
        ProtocolStatus status = output.getStatus();

        if (status.getCode() == ProtocolStatus.SUCCESS) {
          robotRules = parseRules(url.toString(), output.getContent()
              .getContent(), CONTENT_TYPE, agentNames);
        } else {
          robotRules = EMPTY_RULES; // use default rules
        }
      } catch (Throwable t) {
        if (LOG.isInfoEnabled()) {
          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
        }
        // Fetch failed: fall back to empty rules but do NOT cache them, so
        // robots.txt is retried on the next request for this host.
        cacheRule = false; // try again later to fetch robots.txt
        robotRules = EMPTY_RULES;
      }

    }

    if (cacheRule)
      CACHE.put(protocol + ":" + host, robotRules); // cache rules for host

    return robotRules;
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
new file mode 100644
index 0000000..c68eac8
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.io.BufferedReader;
+import java.io.StringReader;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+
+import org.apache.commons.net.ProtocolCommandEvent;
+import org.apache.commons.net.ProtocolCommandListener;
+
+/***
+ * This is a support class for logging all ftp command/reply traffic.
+ * 
+ * @author John Xing
+ ***/
+public class PrintCommandListener implements ProtocolCommandListener {
+  private Logger __logger;
+
+  public PrintCommandListener(Logger logger) {
+    __logger = logger;
+  }
+
+  public void protocolCommandSent(ProtocolCommandEvent event) {
+    try {
+      __logIt(event);
+    } catch (IOException e) {
+      if (__logger.isInfoEnabled()) {
+        __logger.info("PrintCommandListener.protocolCommandSent(): " + e);
+      }
+    }
+  }
+
+  public void protocolReplyReceived(ProtocolCommandEvent event) {
+    try {
+      __logIt(event);
+    } catch (IOException e) {
+      if (__logger.isInfoEnabled()) {
+        __logger.info("PrintCommandListener.protocolReplyReceived(): " + e);
+      }
+    }
+  }
+
+  private void __logIt(ProtocolCommandEvent event) throws IOException {
+    if (!__logger.isInfoEnabled()) {
+      return;
+    }
+    BufferedReader br = new BufferedReader(new StringReader(event.getMessage()));
+    String line;
+    while ((line = br.readLine()) != null) {
+      __logger.info("ftp> " + line);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
new file mode 100644
index 0000000..d936930
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the ftp protocol.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/build.xml b/nutch-plugins/protocol-htmlunit/build.xml
new file mode 100644
index 0000000..899214c
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-htmlunit" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-htmlunit/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/ivy.xml b/nutch-plugins/protocol-htmlunit/ivy.xml
new file mode 100644
index 0000000..8aa78d2
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/ivy.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/plugin.xml b/nutch-plugins/protocol-htmlunit/plugin.xml
new file mode 100644
index 0000000..36bcb80
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-htmlunit"
+   name="HtmlUnit Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.apache.org">
+
+  <runtime>
+    <library name="protocol-htmlunit.jar">
+      <export name="*"/>
+    </library>
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints"/>
+    <import plugin="lib-http"/>
+    <import plugin="lib-htmlunit"/>
+  </requires>
+
+  <extension id="org.apache.nutch.protocol.http"
+             name="HttpProtocol"
+             point="org.apache.nutch.protocol.Protocol">
+
+    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+                    class="org.apache.nutch.protocol.htmlunit.Http">
+      <parameter name="protocolName" value="http"/>
+    </implementation>
+      
+    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+                    class="org.apache.nutch.protocol.htmlunit.Http">
+      <parameter name="protocolName" value="https"/>
+    </implementation>
+
+   </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/pom.xml b/nutch-plugins/protocol-htmlunit/pom.xml
new file mode 100644
index 0000000..e5a57d7
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/pom.xml
@@ -0,0 +1,51 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-htmlunit</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-htmlunit</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-htmlunit</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
new file mode 100644
index 0000000..c40ed69
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.IOException;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
 * {@code Protocol} implementation of the protocol-htmlunit plugin for http and
 * https URLs. Shared configuration and plumbing come from {@link HttpBase};
 * actual request handling is delegated to {@code HttpResponse}.
 */
public class Http extends HttpBase {

  public static final Logger LOG = LoggerFactory.getLogger(Http.class);

  /**
   * Default constructor.
   */
  public Http() {
    super(LOG);
  }

  /**
   * Set the {@link org.apache.hadoop.conf.Configuration} object.
   * 
   * @param conf
   *          the Hadoop configuration to apply (forwarded to the base class)
   */
  public void setConf(Configuration conf) {
    super.setConf(conf);
  }

  /**
   * Command-line entry point: builds an {@code Http} instance with a default
   * Nutch configuration and delegates to {@code HttpBase.main}.
   */
  public static void main(String[] args) throws Exception {
    Http http = new Http();
    http.setConf(NutchConfiguration.create());
    main(http, args);
  }
  
  /**
   * Creates the {@link Response} for a fetch by delegating to
   * {@code HttpResponse}. The {@code redirect} flag is not used here; it is
   * part of the {@link HttpBase} contract.
   */
  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
      throws ProtocolException, IOException {
    return new HttpResponse(this, url, datum);
  }
}


[31/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/model/NutchConfig.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/model/NutchConfig.java b/nutch-core/src/main/java/org/apache/nutch/webui/model/NutchConfig.java
new file mode 100644
index 0000000..2106d23
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/model/NutchConfig.java
@@ -0,0 +1,24 @@
+package org.apache.nutch.webui.model;
+
+import java.io.Serializable;
+
/**
 * Simple serializable name/value configuration entry used by the Nutch web UI
 * model layer.
 *
 * NOTE(review): unlike the sibling model classes, this file carries no ASF
 * license header — confirm and add one at the file level.
 */
public class NutchConfig implements Serializable {

  // Explicit serialVersionUID so the serialized form stays stable across
  // future edits of this class (it was previously compiler-derived).
  private static final long serialVersionUID = 1L;

  // Default kept as the literal "name" to preserve existing behavior;
  // presumably overwritten via setName() before use — TODO confirm.
  private String name = "name";
  private String value;

  public void setName(String name) {
    this.name = name;
  }

  public String getName() {
    return this.name;
  }

  public String getValue() {
    return value;
  }

  public void setValue(String value) {
    this.value = value;
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/model/NutchInstance.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/model/NutchInstance.java b/nutch-core/src/main/java/org/apache/nutch/webui/model/NutchInstance.java
new file mode 100644
index 0000000..2c1f1c5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/model/NutchInstance.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.model;
+
+import java.io.Serializable;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.GeneratedValue;
+import javax.persistence.Id;
+
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+
+@Entity
+public class NutchInstance implements Serializable {
+
+  @Id
+  @GeneratedValue
+  private Long id;
+
+  @Column
+  private String name = "localhost";
+
+  @Column
+  private String host = "localhost";
+
+  @Column
+  private Integer port = 8081;
+
+  @Column
+  private String username;
+
+  @Column
+  private String password;
+
+  private ConnectionStatus connectionStatus;
+
+  public String getName() {
+    return name;
+  }
+
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  public String getHost() {
+    return host;
+  }
+
+  public void setUsername(String username) {
+    this.username = username;
+  }
+
+  public String getUsername() {
+    return username;
+  }
+
+  public void setHost(String host) {
+    this.host = host;
+  }
+
+  public Integer getPort() {
+    return port;
+  }
+
+  public void setPort(Integer port) {
+    this.port = port;
+  }
+
+  public ConnectionStatus getConnectionStatus() {
+    return connectionStatus;
+  }
+
+  public void setConnectionStatus(ConnectionStatus connectionStatus) {
+    this.connectionStatus = connectionStatus;
+  }
+
+  public URI getUrl() {
+    try {
+      return new URI("http", null, host, port, null, null, null);
+    } catch (URISyntaxException e) {
+      throw new IllegalStateException("Cannot parse url parameters", e);
+    }
+  }
+
+  public String getPassword() {
+    return password;
+  }
+
+  public void setPassword(String password) {
+    this.password = password;
+  }
+
+  public Long getId() {
+    return id;
+  }
+
+  public void setId(Long id) {
+    this.id = id;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/model/SeedList.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/model/SeedList.java b/nutch-core/src/main/java/org/apache/nutch/webui/model/SeedList.java
new file mode 100644
index 0000000..72d3d75
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/model/SeedList.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.model;
+
+import java.io.Serializable;
+import java.util.Collection;
+
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.GeneratedValue;
+import javax.persistence.Id;
+import javax.persistence.OneToMany;
+
+import org.apache.commons.collections4.CollectionUtils;
+import org.codehaus.jackson.annotate.JsonIgnore;
+
+import com.fasterxml.jackson.annotation.JsonManagedReference;
+import com.j256.ormlite.field.ForeignCollectionField;
+
+@Entity
+public class SeedList implements Serializable {
+
+  @Id
+  @GeneratedValue
+  private Long id;
+
+  @Column
+  private String name;
+
+  @OneToMany
+  @ForeignCollectionField(eager = true)
+  @JsonManagedReference
+  private Collection<SeedUrl> seedUrls;
+
+  public Long getId() {
+    return id;
+  }
+
+  public void setId(Long id) {
+    this.id = id;
+  }
+
+  @JsonIgnore
+  public int getSeedUrlsCount() {
+    if (CollectionUtils.isEmpty(seedUrls)) {
+      return 0;
+    }
+    return seedUrls.size();
+  }
+
+  public Collection<SeedUrl> getSeedUrls() {
+    return seedUrls;
+  }
+
+  public void setSeedUrls(Collection<SeedUrl> seedUrls) {
+    this.seedUrls = seedUrls;
+  }
+
+  public String getName() {
+    return name;
+  }
+
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((id == null) ? 0 : id.hashCode());
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    SeedList other = (SeedList) obj;
+    if (id == null) {
+      if (other.id != null)
+        return false;
+    } else if (!id.equals(other.id))
+      return false;
+    return true;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/model/SeedUrl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/model/SeedUrl.java b/nutch-core/src/main/java/org/apache/nutch/webui/model/SeedUrl.java
new file mode 100644
index 0000000..5f89241
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/model/SeedUrl.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.model;
+
+import java.io.Serializable;
+
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.GeneratedValue;
+import javax.persistence.Id;
+
+import org.codehaus.jackson.annotate.JsonIgnore;
+
+import com.fasterxml.jackson.annotation.JsonBackReference;
+import com.j256.ormlite.field.DatabaseField;
+
+@Entity
+public class SeedUrl implements Serializable {
+
+  @Id
+  @GeneratedValue
+  private Long id;
+
+  @Column
+  @DatabaseField(foreign = true, foreignAutoCreate = true, foreignAutoRefresh = true)
+  @JsonBackReference
+  private SeedList seedList;
+
+  @Column
+  private String url;
+
+  public Long getId() {
+    return id;
+  }
+
+  public void setId(Long id) {
+    this.id = id;
+  }
+
+  public String getUrl() {
+    return url;
+  }
+
+  public void setUrl(String url) {
+    this.url = url;
+  }
+
+  @JsonIgnore
+  public SeedList getSeedList() {
+    return seedList;
+  }
+
+  @JsonIgnore
+  public void setSeedList(SeedList seedList) {
+    this.seedList = seedList;
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((id == null) ? 0 : id.hashCode());
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    SeedUrl other = (SeedUrl) obj;
+    if (id == null) {
+      if (other.id != null)
+        return false;
+    } else if (!id.equals(other.id))
+      return false;
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/AbstractBasePage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/AbstractBasePage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/AbstractBasePage.html
new file mode 100644
index 0000000..101a0f3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/AbstractBasePage.html
@@ -0,0 +1,33 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8">
+<meta http-equiv="X-UA-Compatible" content="IE=edge">
+<meta name="viewport" content="width=device-width, initial-scale=1">
+<meta name="description" content="">
+<meta name="author" content="">
+<link rel="shortcut icon" href="../../assets/ico/favicon.ico">
+
+<title>Apache Nutch</title>
+
+</head>
+<body>
+	<div id="wrapper">
+		<nav wicket:id="navigation" class="bs-docs-nav"></nav>
+		<div id="page-wrapper">
+			<div wicket:id="globalNotificationPanel"></div>
+			<wicket:child></wicket:child>
+		</div>
+	</div>
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/AbstractBasePage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/AbstractBasePage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/AbstractBasePage.java
new file mode 100644
index 0000000..5611d74
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/AbstractBasePage.java
@@ -0,0 +1,206 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages;
+
+import static de.agilecoders.wicket.core.markup.html.bootstrap.navbar.Navbar.ComponentPosition.LEFT;
+import static de.agilecoders.wicket.core.markup.html.bootstrap.navbar.NavbarComponents.transform;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.pages.crawls.CrawlsPage;
+import org.apache.nutch.webui.pages.instances.InstancesPage;
+import org.apache.nutch.webui.pages.menu.VerticalMenu;
+import org.apache.nutch.webui.pages.seed.SeedListsPage;
+import org.apache.nutch.webui.pages.settings.SettingsPage;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.apache.nutch.webui.service.NutchService;
+import org.apache.wicket.Component;
+import org.apache.wicket.Page;
+import org.apache.wicket.markup.html.GenericWebPage;
+import org.apache.wicket.markup.html.link.AbstractLink;
+import org.apache.wicket.markup.html.link.Link;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.model.LoadableDetachableModel;
+import org.apache.wicket.model.Model;
+import org.apache.wicket.model.PropertyModel;
+import org.apache.wicket.model.ResourceModel;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+
+import de.agilecoders.wicket.core.markup.html.bootstrap.button.dropdown.DropDownButton;
+import de.agilecoders.wicket.core.markup.html.bootstrap.button.dropdown.MenuBookmarkablePageLink;
+import de.agilecoders.wicket.core.markup.html.bootstrap.button.dropdown.MenuDivider;
+import de.agilecoders.wicket.core.markup.html.bootstrap.common.NotificationPanel;
+import de.agilecoders.wicket.core.markup.html.bootstrap.image.IconType;
+import de.agilecoders.wicket.core.markup.html.bootstrap.navbar.Navbar.ComponentPosition;
+import de.agilecoders.wicket.core.markup.html.bootstrap.navbar.Navbar.Position;
+import de.agilecoders.wicket.core.markup.html.bootstrap.navbar.NavbarButton;
+import de.agilecoders.wicket.core.markup.html.bootstrap.navbar.NavbarComponents;
+import de.agilecoders.wicket.core.markup.html.bootstrap.navbar.NavbarDropDownButton;
+import de.agilecoders.wicket.extensions.markup.html.bootstrap.icon.FontAwesomeIconType;
+
+public abstract class AbstractBasePage<T> extends GenericWebPage<T> {
+  /**
+   * 
+   */
+  private static final long serialVersionUID = 1L;
+
+  @SpringBean
+  private NutchService service;
+
+  @SpringBean
+  private NutchInstanceService instanceService;
+
+  private VerticalMenu navbar;
+
+  protected IModel<NutchInstance> currentInstance = new InstanceModel();
+
+  public AbstractBasePage() {
+    navbar = new VerticalMenu("navigation");
+    navbar.brandName(Model.of("Apache Nutch GUI"));
+    navbar.setInverted(true);
+    navbar.setPosition(Position.TOP);
+    add(navbar);
+
+    addMenuItem(DashboardPage.class, "navbar.menu.dashboard",
+        FontAwesomeIconType.dashboard);
+    addMenuItem(StatisticsPage.class, "navbar.menu.statistics",
+        FontAwesomeIconType.bar_chart_o);
+    addMenuItem(InstancesPage.class, "navbar.menu.instances",
+        FontAwesomeIconType.gears);
+    addMenuItem(SettingsPage.class, "navbar.menu.settings",
+        FontAwesomeIconType.wrench);
+    addMenuItem(CrawlsPage.class, "navbar.menu.crawls",
+        FontAwesomeIconType.refresh);
+    addMenuItem(SchedulingPage.class, "navbar.menu.scheduling",
+        FontAwesomeIconType.clock_o);
+    addMenuItem(SearchPage.class, "navbar.menu.search",
+        FontAwesomeIconType.search);
+    addMenuItem(SeedListsPage.class, "navbar.menu.seedLists",
+        FontAwesomeIconType.file);
+
+    navbar.addComponents(transform(ComponentPosition.RIGHT,
+        addInstancesMenuMenu()));
+    navbar.addComponents(transform(ComponentPosition.RIGHT, addUserMenu()));
+
+    add(new NotificationPanel("globalNotificationPanel"));
+
+    if (currentInstance.getObject() == null && !(this instanceof InstancesPage)) {
+      getSession().error("No running instances found!");
+      setResponsePage(InstancesPage.class);
+    }
+  }
+
+  protected Component addUserMenu() {
+    DropDownButton userMenu = new NavbarDropDownButton(Model.of("Username")) {
+      /**
+       * 
+       */
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      protected List<AbstractLink> newSubMenuButtons(final String buttonMarkupId) {
+        List<AbstractLink> subMenu = Lists.newArrayList();
+        subMenu.add(new MenuBookmarkablePageLink<Void>(UserSettingsPage.class,
+            new ResourceModel("navbar.userMenu.settings"))
+            .setIconType(FontAwesomeIconType.gear));
+        subMenu.add(new MenuDivider());
+        subMenu.add(new MenuBookmarkablePageLink<Void>(LogOutPage.class,
+            new ResourceModel("navbar.userMenu.logout"))
+            .setIconType(FontAwesomeIconType.power_off));
+        return subMenu;
+      }
+    }.setIconType(FontAwesomeIconType.user);
+    return userMenu;
+  }
+
+  protected Component addInstancesMenuMenu() {
+    IModel<String> instanceName = PropertyModel.of(currentInstance, "name");
+    DropDownButton instancesMenu = new NavbarDropDownButton(instanceName) {
+
+      /**
+       * 
+       */
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      protected List<AbstractLink> newSubMenuButtons(String buttonMarkupId) {
+        List<NutchInstance> instances = instanceService.getInstances();
+        List<AbstractLink> subMenu = Lists.newArrayList();
+        for (NutchInstance instance : instances) {
+          subMenu.add(new Link<NutchInstance>(buttonMarkupId, Model
+              .of(instance)) {
+            /**
+                 * 
+                 */
+                private static final long serialVersionUID = 1L;
+
+            @Override
+            public void onClick() {
+              currentInstance.setObject(getModelObject());
+              setResponsePage(DashboardPage.class);
+            }
+          }.setBody(Model.of(instance.getName())));
+        }
+        return subMenu;
+      }
+    }.setIconType(FontAwesomeIconType.gears);
+
+    return instancesMenu;
+  }
+
+  private <P extends Page> void addMenuItem(Class<P> page, String label,
+      IconType icon) {
+    Component button = new NavbarButton<Void>(page, Model.of(getString(label)))
+        .setIconType(icon);
+    navbar.addComponents(NavbarComponents.transform(LEFT, button));
+  }
+
+  protected NutchInstance getCurrentInstance() {
+    return currentInstance.getObject();
+  }
+
+  private class InstanceModel extends LoadableDetachableModel<NutchInstance> {
+
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    @Override
+    public void setObject(NutchInstance instance) {
+      super.setObject(instance);
+      getSession().setAttribute("instanceId", instance.getId());
+    }
+
+    @Override
+    protected NutchInstance load() {
+      Long instanceId = (Long) getSession().getAttribute("instanceId");
+      if (instanceId == null) {
+        return getFirstInstance();
+      }
+      return instanceService.getInstance(instanceId);
+    }
+
+    private NutchInstance getFirstInstance() {
+      return Iterables.getFirst(instanceService.getInstances(), null);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/DashboardPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/DashboardPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/DashboardPage.html
new file mode 100644
index 0000000..b6d5426
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/DashboardPage.html
@@ -0,0 +1,52 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8" />
+<title>Wicket extend</title>
+</head>
+
+<body>
+	<wicket:extend>
+		<h2>
+			<wicket:message key="navbar.menu.dashboard">Instances</wicket:message>
+		</h2>
+		<div class="row">
+			<div class="col-lg-3">
+				<div class="panel panel-info">
+					<div wicket:id="panel" class="panel-heading">
+						<div class="row">
+							<div class="col-xs-6">
+								<i class="fa fa-gears fa-5x"></i>
+							</div>
+							<div class="col-xs-6 text-right">
+								<p class="announcement-heading" wicket:id="jobsRunning">2</p>
+								<p class="announcement-text">Jobs running</p>
+							</div>
+						</div>
+					</div>
+					<a href="#non-existing-id" wicket:id="viewInstances">
+						<div class="panel-footer announcement-bottom">
+							<div class="row">
+								<div class="col-xs-6">View instances</div>
+								<div class="col-xs-6 text-right">
+									<i class="fa fa-arrow-circle-right"></i>
+								</div>
+							</div>
+						</div>
+					</a>
+				</div>
+			</div>
+		</div>
+	</wicket:extend>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/DashboardPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/DashboardPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/DashboardPage.java
new file mode 100644
index 0000000..50586b9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/DashboardPage.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages;
+
+import org.apache.nutch.webui.client.model.NutchStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.pages.instances.InstancesPage;
+import org.apache.nutch.webui.service.NutchService;
+import org.apache.wicket.ajax.AjaxSelfUpdatingTimerBehavior;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.link.BookmarkablePageLink;
+import org.apache.wicket.model.LoadableDetachableModel;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+import org.apache.wicket.util.time.Duration;
+
+public class DashboardPage extends AbstractBasePage<Object> {
+  /**
+   * 
+   */
+  private static final long serialVersionUID = 1L;
+
+  @SpringBean
+  private NutchService nutchService;
+
+  private WebMarkupContainer panel;
+
+  public DashboardPage() {
+    panel = new WebMarkupContainer("panel");
+    panel.setOutputMarkupId(true);
+    panel.add(new AjaxSelfUpdatingTimerBehavior(Duration.ONE_SECOND));
+    panel.add(new Label("jobsRunning", new JobsModel()));
+    add(panel);
+    add(new BookmarkablePageLink<Void>("viewInstances", InstancesPage.class));
+  }
+
+  private class JobsModel extends LoadableDetachableModel<Integer> {
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 1L;
+
+    @Override
+    protected Integer load() {
+      NutchInstance currentInstance = getCurrentInstance();
+      Long id = currentInstance.getId();
+      NutchStatus nutchStatus = nutchService.getNutchStatus(id);
+      return nutchStatus.getRunningJobs().size();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/LogOutPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/LogOutPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/LogOutPage.java
new file mode 100644
index 0000000..9d0298f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/LogOutPage.java
@@ -0,0 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages;
+
+public class LogOutPage extends AbstractBasePage {
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/SchedulingPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/SchedulingPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/SchedulingPage.java
new file mode 100644
index 0000000..54876a4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/SchedulingPage.java
@@ -0,0 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages;
+
+public class SchedulingPage extends AbstractBasePage {
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/SearchPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/SearchPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/SearchPage.java
new file mode 100644
index 0000000..4a5a736
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/SearchPage.java
@@ -0,0 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages;
+
+public class SearchPage extends AbstractBasePage {
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/StatisticsPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/StatisticsPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/StatisticsPage.java
new file mode 100644
index 0000000..048fb3c
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/StatisticsPage.java
@@ -0,0 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages;
+
+public class StatisticsPage extends AbstractBasePage {
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/UrlsUploadPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/UrlsUploadPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/UrlsUploadPage.java
new file mode 100644
index 0000000..e7c1b28
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/UrlsUploadPage.java
@@ -0,0 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages;
+
+public class UrlsUploadPage extends AbstractBasePage {
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/UserSettingsPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/UserSettingsPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/UserSettingsPage.java
new file mode 100644
index 0000000..3e64963
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/UserSettingsPage.java
@@ -0,0 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages;
+
+public class UserSettingsPage extends AbstractBasePage {
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java
new file mode 100644
index 0000000..52fe98e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.assets;
+
+import org.apache.wicket.request.resource.CssResourceReference;
+
+public class NutchUiCssReference extends CssResourceReference {
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Singleton instance of this reference
+   */
+  private static final NutchUiCssReference INSTANCE = new NutchUiCssReference();
+
+  public static NutchUiCssReference instance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Private constructor.
+   */
+  private NutchUiCssReference() {
+    super(NutchUiCssReference.class, "nutch-style.css");
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/assets/nutch-style.css
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/assets/nutch-style.css b/nutch-core/src/main/java/org/apache/nutch/webui/pages/assets/nutch-style.css
new file mode 100644
index 0000000..8cc01ac
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/assets/nutch-style.css
@@ -0,0 +1,149 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+@CHARSET "UTF-8";
+
+body {
+	margin-top: 50px;
+}
+
+#wrapper {
+	padding-left: 0;
+}
+
+#page-wrapper {
+	width: 100%;
+	padding: 5px 15px;
+}
+
+/* Nav Messages */
+.messages-dropdown .dropdown-menu .message-preview .avatar,.messages-dropdown .dropdown-menu .message-preview .name,.messages-dropdown .dropdown-menu .message-preview .message,.messages-dropdown .dropdown-menu .message-preview .time
+	{
+	display: block;
+}
+
+.messages-dropdown .dropdown-menu .message-preview .avatar {
+	float: left;
+	margin-right: 15px;
+}
+
+.messages-dropdown .dropdown-menu .message-preview .name {
+	font-weight: bold;
+}
+
+.messages-dropdown .dropdown-menu .message-preview .message {
+	font-size: 12px;
+}
+
+.messages-dropdown .dropdown-menu .message-preview .time {
+	font-size: 12px;
+}
+
+/* Nav Announcements */
+.announcement-heading {
+	font-size: 50px;
+	margin: 0;
+}
+
+.announcement-text {
+	margin: 0;
+}
+
+/* Table Headers */
+table.tablesorter thead {
+	cursor: pointer;
+}
+
+table.tablesorter thead tr th:hover {
+	background-color: #f5f5f5;
+}
+
+/* Flot Chart Containers */
+.flot-chart {
+	display: block;
+	height: 400px;
+}
+
+.flot-chart-content {
+	width: 100%;
+	height: 100%;
+}
+
+/* Edit Below to Customize Widths > 768px */
+@media ( min-width :768px) {
+	/* Wrappers */
+	#wrapper {
+		padding-left: 225px;
+	}
+	#page-wrapper {
+		padding: 15px 25px;
+	}
+
+	/* Side Nav */
+	.side-nav {
+		margin-left: -225px;
+		left: 225px;
+		width: 225px;
+		position: fixed;
+		top: 50px;
+		height: 100%;
+		border-radius: 0;
+		border: none;
+		background-color: #222222;
+		overflow-y: auto;
+	}
+
+	/* Bootstrap Default Overrides - Customized Dropdowns for the Side Nav */
+	.side-nav>li.dropdown>ul.dropdown-menu {
+		position: relative;
+		min-width: 225px;
+		margin: 0;
+		padding: 0;
+		border: none;
+		border-radius: 0;
+		background-color: transparent;
+		box-shadow: none;
+		-webkit-box-shadow: none;
+	}
+	.side-nav>li.dropdown>ul.dropdown-menu>li>a {
+		color: #999999;
+		padding: 15px 15px 15px 25px;
+	}
+	.side-nav>li.dropdown>ul.dropdown-menu>li>a:hover,.side-nav>li.dropdown>ul.dropdown-menu>li>a.active,.side-nav>li.dropdown>ul.dropdown-menu>li>a:focus
+		{
+		color: #fff;
+		background-color: #080808;
+	}
+	.side-nav>li>a {
+		width: 225px;
+	}
+	.navbar-inverse .navbar-nav>li>a:hover,.navbar-inverse .navbar-nav>li>a:focus
+		{
+		background-color: #080808;
+	}
+
+	/* Nav Messages */
+	.messages-dropdown .dropdown-menu {
+		min-width: 300px;
+	}
+	.messages-dropdown .dropdown-menu li a {
+		white-space: normal;
+	}
+	.navbar-collapse {
+		padding-left: 15px !important;
+		padding-right: 15px !important;
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java
new file mode 100644
index 0000000..f509bd5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.components;
+
+import java.util.Map;
+
+import org.apache.wicket.markup.html.basic.EnumLabel;
+import org.apache.wicket.model.AbstractReadOnlyModel;
+import org.apache.wicket.model.IModel;
+
+import de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelBehavior;
+import de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType;
+
+/**
+ * Label which renders connection status as bootstrap label
+ * 
+ * @author feodor
+ * 
+ */
+public class ColorEnumLabel<E extends Enum<E>> extends EnumLabel<E> {
+  private Map<E, LabelType> labelTypeMap;
+
+  ColorEnumLabel(String id, IModel<E> model, Map<E, LabelType> labelTypeMap) {
+    super(id, model);
+    this.labelTypeMap = labelTypeMap;
+  }
+
+  @Override
+  protected void onInitialize() {
+    super.onInitialize();
+    setOutputMarkupId(true);
+    add(new LabelBehavior(new EnumCssModel(getModel())));
+  }
+
+  private class EnumCssModel extends AbstractReadOnlyModel<LabelType> {
+    private IModel<E> model;
+
+    public EnumCssModel(IModel<E> model) {
+      this.model = model;
+    }
+
+    @Override
+    public LabelType getObject() {
+      LabelType labelType = labelTypeMap.get(model.getObject());
+      if (labelType == null) {
+        return LabelType.Default;
+      }
+      return labelType;
+    }
+  }
+
+  public static <E extends Enum<E>> ColorEnumLabelBuilder<E> getBuilder(
+      String id) {
+    return new ColorEnumLabelBuilder<E>(id);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java
new file mode 100644
index 0000000..3ddaede
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.components;
+
+import java.util.Map;
+
+import org.apache.wicket.model.IModel;
+
+import com.google.common.collect.Maps;
+
+import de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType;
+
+public class ColorEnumLabelBuilder<E extends Enum<E>> {
+  private Map<E, LabelType> labelTypeMap = Maps.newHashMap();
+  private IModel<E> model;
+  private String id;
+
+  public ColorEnumLabelBuilder(String id) {
+    this.id = id;
+  }
+
+  public ColorEnumLabelBuilder<E> withModel(IModel<E> model) {
+    this.model = model;
+    return this;
+  }
+
+  public ColorEnumLabelBuilder<E> withEnumColor(E e, LabelType type) {
+    labelTypeMap.put(e, type);
+    return this;
+  }
+
+  public ColorEnumLabel<E> build() {
+    return new ColorEnumLabel<E>(id, model, labelTypeMap);
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
new file mode 100644
index 0000000..91874c3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.components;
+
+import org.apache.wicket.markup.repeater.util.ModelIteratorAdapter;
+import org.apache.wicket.model.CompoundPropertyModel;
+import org.apache.wicket.model.IModel;
+
+/**
+ * This is iterator adapter, which wraps iterable items with
+ * CompoundPropertyModel.
+ * 
+ * @author feodor
+ * 
+ * @param <T>
+ */
+public class CpmIteratorAdapter<T> extends ModelIteratorAdapter<T> {
+  public CpmIteratorAdapter(Iterable<T> iterable) {
+    super(iterable);
+  }
+
+  @Override
+  protected IModel<T> model(T object) {
+    return new CompoundPropertyModel<T>(object);
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.html
new file mode 100644
index 0000000..81095f0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.html
@@ -0,0 +1,58 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8" />
+<title>Wicket extend</title>
+</head>
+
+<body>
+	<wicket:extend>
+		<div wicket:id="notificationPanel"></div>
+
+		<form class="form-horizontal" wicket:id="crawlForm">
+			<div class="form-group">
+				<label for="inputEmail" class="control-label col-xs-2">Crawl id</label>
+				<div class="col-xs-10">
+					<span wicket:id="crawlId">123-1321-123</span>
+				</div>
+			</div>
+			<div class="form-group">
+				<label for="seedDir" class="control-label col-xs-2">Crawl name</label>
+				<div class="col-xs-10">
+					<input class="form-control" id="seedDir" wicket:id="crawlName" placeholder="Crawl name">
+				</div>
+			</div>
+			<div class="form-group">
+				<label for="seedDir" class="control-label col-xs-2">Seed list</label>
+				<div class="col-xs-10">
+					<select wicket:id="seedList">
+						<option>Google list</option>
+						<option>Yahoo list</option>
+					</select>
+				</div>
+			</div>
+
+			<div class="form-group">
+				<label for="numberOfRounds" class="control-label col-xs-2">Rounds</label>
+				<div class="col-xs-10">
+					<select wicket:id="numberOfRounds">
+						<option>1</option>
+						<option>2</option>
+					</select>
+				</div>
+			</div>
+		</form>
+
+	</wicket:extend>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java
new file mode 100644
index 0000000..be2cf42
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.crawls;
+
+import java.util.List;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.service.CrawlService;
+import org.apache.nutch.webui.service.SeedListService;
+import org.apache.wicket.ajax.AjaxRequestTarget;
+import org.apache.wicket.ajax.markup.html.form.AjaxSubmitLink;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.form.ChoiceRenderer;
+import org.apache.wicket.markup.html.form.DropDownChoice;
+import org.apache.wicket.markup.html.form.Form;
+import org.apache.wicket.markup.html.form.TextField;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.model.Model;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+import com.google.common.collect.Lists;
+
+import de.agilecoders.wicket.core.markup.html.bootstrap.common.NotificationPanel;
+import de.agilecoders.wicket.core.markup.html.bootstrap.dialog.Modal;
+import de.agilecoders.wicket.core.markup.html.bootstrap.form.BootstrapForm;
+
+public class CrawlPanel extends Modal {
+  private static final int MAX_ROUNDS = 10;
+
+  private BootstrapForm<Crawl> form;
+
+  @SpringBean
+  private CrawlService crawlService;
+
+  @SpringBean
+  private SeedListService seedListService;
+
+  private NotificationPanel notificationPanel;
+
+  public CrawlPanel(String markupId) {
+    super(markupId);
+    header(Model.of("Crawl"));
+
+    notificationPanel = new NotificationPanel("notificationPanel");
+    notificationPanel.setOutputMarkupId(true);
+    add(notificationPanel);
+
+    form = new BootstrapForm<Crawl>("crawlForm");
+    form.add(new Label("crawlId"));
+    form.add(new TextField<String>("crawlName").setRequired(true));
+
+    form.add(new DropDownChoice<Integer>("numberOfRounds", getNumbersOfRounds()));
+    form.add(new DropDownChoice<SeedList>("seedList",
+        seedListService.findAll(), new ChoiceRenderer<SeedList>("name"))
+        .setRequired(true));
+
+    addButton(new AjaxSubmitLink("button", form) {
+      @Override
+      protected void onSubmit(AjaxRequestTarget target, Form<?> ajaxForm) {
+        crawlService.saveCrawl(form.getModelObject());
+        target.add(this.getPage());
+      }
+
+      protected void onError(AjaxRequestTarget target, Form<?> form) {
+        target.add(notificationPanel);
+      };
+    }.setBody(Model.of("Save")));
+    add(form);
+  }
+
+  public void setModel(IModel<Crawl> model) {
+    form.setModel(model);
+  }
+
+  private List<Integer> getNumbersOfRounds() {
+    List<Integer> numbers = Lists.newArrayList();
+    for (int i = 1; i <= MAX_ROUNDS; i++) {
+      numbers.add(i);
+    }
+    return numbers;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.html
new file mode 100644
index 0000000..3c5d789
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.html
@@ -0,0 +1,90 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8" />
+<title>Wicket extend</title>
+</head>
+
+<body>
+	<wicket:extend>
+		<h2>
+			<wicket:message key="navbar.menu.crawls">Crawls</wicket:message>
+		</h2>
+
+		<div class="row">
+			<div class="col-lg-8">
+				<div class="row">
+					<div class="col-lg-8 col-md-offset-10">
+						<button class="btn btn-success btn-default" wicket:id="newCrawl">
+							<i class="fa fa-plus"></i> Add new crawl
+						</button>
+					</div>
+				</div>
+				<table class="table table-hover table-striped tablesorter">
+					<thead>
+						<tr>
+							<th class="header col-md-2">Crawl name</th>
+							<th class="header col-md-2">Seed list</th>
+							<th class="header col-md-2">Status</th>
+							<th class="header col-md-2">Progress</th>
+							<th></th>
+						</tr>
+					</thead>
+
+					<tbody wicket:id="crawlsTable">
+						<tr wicket:id="crawls">
+							<td>
+								<a href="#" data-toggle="modal" data-target="#crawlInfo" wicket:id="edit">
+									<span wicket:id="crawlName">Crawl name</span>
+								</a>
+							</td>
+							<td>
+								<span wicket:id="seedList.name">Google list</span>
+							</td>
+							<td>
+								<span wicket:id="status" class="label">Finished</span>
+							</td>
+							<td>
+								<span wicket:id="progress">50</span>
+								%
+
+							</td>
+
+							<td>
+								<button class="btn btn-sm btn-default" type="button" wicket:id="start">
+									<span class="fa fa-play"></span>
+								</button>
+								<button class="btn btn-sm btn-danger" type="button" wicket:id="delete">
+									<span class="fa fa-trash-o"></span>
+								</button>
+							</td>
+						</tr>
+					</tbody>
+				</table>
+				<div wicket:id="crawl"></div>
+			</div>
+			<div class="col-lg-4">
+				<div class="panel panel-primary">
+					<div class="panel-heading">
+						<h3 class="panel-title">Help</h3>
+					</div>
+					<div class="panel-body">
+						<p>Some help about crawling</p>
+					</div>
+				</div>
+			</div>
+		</div>
+		<!--row-->
+	</wicket:extend>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java
new file mode 100644
index 0000000..5117520
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.crawls;
+
+import static de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType.Danger;
+import static de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType.Default;
+import static de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType.Info;
+import static de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType.Success;
+import static org.apache.nutch.webui.client.model.Crawl.CrawlStatus.CRAWLING;
+import static org.apache.nutch.webui.client.model.Crawl.CrawlStatus.ERROR;
+import static org.apache.nutch.webui.client.model.Crawl.CrawlStatus.FINISHED;
+import static org.apache.nutch.webui.client.model.Crawl.CrawlStatus.NEW;
+
+import java.util.Iterator;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.client.model.Crawl.CrawlStatus;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.ColorEnumLabelBuilder;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.CrawlService;
+import org.apache.wicket.ajax.AjaxRequestTarget;
+import org.apache.wicket.ajax.AjaxSelfUpdatingTimerBehavior;
+import org.apache.wicket.ajax.markup.html.AjaxLink;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.EnumLabel;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.link.Link;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.CompoundPropertyModel;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+import org.apache.wicket.util.time.Duration;
+
+/**
+ * This page is for crawls management
+ * 
+ * @author feodor
+ * 
+ */
+public class CrawlsPage extends AbstractBasePage<Void> {
+
+  private static final Duration UPDATE_TIMEOUT = Duration.seconds(2);
+
+  @SpringBean
+  private CrawlService crawlService;
+
+  private WebMarkupContainer crawlsTable;
+  private CrawlPanel crawlPanel;
+
+  public CrawlsPage() {
+    crawlsTable = new WebMarkupContainer("crawlsTable");
+    crawlsTable.setOutputMarkupId(true);
+    crawlsTable.add(new AjaxSelfUpdatingTimerBehavior(UPDATE_TIMEOUT));
+
+    RefreshingView<Crawl> crawls = new RefreshingView<Crawl>("crawls") {
+
+      @Override
+      protected Iterator<IModel<Crawl>> getItemModels() {
+        return new CpmIteratorAdapter<Crawl>(crawlService.getCrawls());
+      }
+
+      @Override
+      protected void populateItem(Item<Crawl> item) {
+        populateCrawlRow(item);
+      }
+    };
+
+    crawlsTable.add(crawls);
+    add(crawlsTable);
+
+    crawlPanel = new CrawlPanel("crawl");
+    add(crawlPanel);
+
+    add(new AjaxLink<Crawl>("newCrawl") {
+      @Override
+      public void onClick(AjaxRequestTarget target) {
+        editCrawl(target, new CompoundPropertyModel<Crawl>(createNewCrawl()));
+      }
+    });
+  }
+
+  private void populateCrawlRow(Item<Crawl> item) {
+    item.add(new AjaxLink<Crawl>("edit", item.getModel()) {
+      @Override
+      public void onClick(AjaxRequestTarget target) {
+        editCrawl(target, getModel());
+      }
+    }.add(new Label("crawlName")));
+    item.add(new Label("seedList.name"));
+
+    item.add(new Label("progress"));
+    item.add(createStatusLabel());
+    item.add(new Link<Crawl>("start", item.getModel()) {
+      @Override
+      public void onClick() {
+        crawlService.startCrawl(getModelObject().getId(), getCurrentInstance());
+      }
+    });
+
+    item.add(new Link<Crawl>("delete", item.getModel()) {
+      @Override
+      public void onClick() {
+        crawlService.deleteCrawl(getModelObject().getId());
+      }
+    });
+  }
+
+  private void editCrawl(AjaxRequestTarget target, IModel<Crawl> model) {
+    crawlPanel.setModel(model);
+    target.add(crawlPanel);
+    crawlPanel.appendShowDialogJavaScript(target);
+  }
+
+  private Crawl createNewCrawl() {
+    return new Crawl();
+  }
+
+  private EnumLabel<CrawlStatus> createStatusLabel() {
+    return new ColorEnumLabelBuilder<CrawlStatus>("status")
+        .withEnumColor(NEW, Default).withEnumColor(ERROR, Danger)
+        .withEnumColor(FINISHED, Success).withEnumColor(CRAWLING, Info).build();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancePanel.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancePanel.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancePanel.html
new file mode 100644
index 0000000..cdbc242
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancePanel.html
@@ -0,0 +1,46 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8">
+<title>Instance panel</title>
+</head>
+<body>
+	<wicket:extend>
+		<div wicket:id="notificationPanel"></div>
+		<form class="form-horizontal" wicket:id="instanceForm">
+			<div class="form-group">
+				<label class="control-label col-xs-2"><wicket:message key="instances.label.name">Instance name</wicket:message></label>
+				<div class="col-xs-10">
+					<input class="form-control" wicket:id="name" placeholder="Localhost instance">
+				</div>
+			</div>
+			<div class="form-group">
+				<label class="control-label col-xs-2"><wicket:message key="instances.label.hostname">Host</wicket:message></label>
+				<div class="col-xs-10">
+					<input class="form-control" wicket:id="host" placeholder="http://localhost:8080">
+				</div>
+			</div>
+
+			<div class="form-group">
+				<label class="control-label col-xs-2"><wicket:message key="instances.label.port">Port</wicket:message></label>
+				<div class="col-xs-10">
+					<input class="form-control" wicket:id="port" placeholder="http://localhost:8080">
+				</div>
+			</div>
+			
+			<div class="form-group">
+				<label class="control-label col-xs-2"><wicket:message key="instances.label.username">Username</wicket:message></label>
+				<div class="col-xs-10">
+					<input class="form-control" wicket:id="username" placeholder="">
+				</div>
+			</div>
+			<div class="form-group">
+				<label class="control-label col-xs-2"><wicket:message key="instances.label.password">Password</wicket:message></label>
+				<div class="col-xs-10">
+					<input class="form-control" type="password" wicket:id="password" placeholder="">
+				</div>
+			</div>
+		</form>
+	</wicket:extend>
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancePanel.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancePanel.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancePanel.java
new file mode 100644
index 0000000..a03ec1d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancePanel.java
@@ -0,0 +1,62 @@
+package org.apache.nutch.webui.pages.instances;
+
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.apache.wicket.ajax.AjaxRequestTarget;
+import org.apache.wicket.ajax.markup.html.form.AjaxSubmitLink;
+import org.apache.wicket.markup.html.form.Form;
+import org.apache.wicket.markup.html.form.PasswordTextField;
+import org.apache.wicket.markup.html.form.TextField;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.model.Model;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+import de.agilecoders.wicket.core.markup.html.bootstrap.common.NotificationPanel;
+import de.agilecoders.wicket.core.markup.html.bootstrap.dialog.Modal;
+import de.agilecoders.wicket.core.markup.html.bootstrap.form.BootstrapForm;
+
+public class InstancePanel extends Modal {
+
+  private BootstrapForm<NutchInstance> form;
+
+  private NotificationPanel notificationPanel;
+
+  @SpringBean
+  private NutchInstanceService instanceService;
+
+  public InstancePanel(String markupId) {
+    super(markupId);
+    header(Model.of("Instance"));
+
+    notificationPanel = new NotificationPanel("notificationPanel");
+    notificationPanel.setOutputMarkupId(true);
+    add(notificationPanel);
+
+    form = new BootstrapForm<NutchInstance>("instanceForm");
+    form.add(new TextField<String>("name").setRequired(true));
+    form.add(new TextField<String>("host").setRequired(true));
+    form.add(new TextField<Integer>("port").setRequired(true));
+    form.add(new TextField<String>("username"));
+    form.add(new PasswordTextField("password").setResetPassword(false)
+        .setRequired(false));
+
+    addButton(new AjaxSubmitLink("button", form) {
+      @Override
+      protected void onSubmit(AjaxRequestTarget target, Form<?> ajaxForm) {
+        instanceService.saveInstance(form.getModelObject());
+        target.add(this.getPage());
+
+      }
+
+      protected void onError(AjaxRequestTarget target, Form<?> form) {
+        target.add(notificationPanel);
+      };
+    }.setBody(Model.of("Save")));
+    add(form);
+  }
+
+  public void setModel(IModel<NutchInstance> model) {
+    form.setModel(model);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancesPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancesPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancesPage.html
new file mode 100644
index 0000000..15e6ed8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancesPage.html
@@ -0,0 +1,66 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="UTF-8">
+<title>Instances</title>
+</head>
+<body>
+	<wicket:extend>
+		<h2>
+			<wicket:message key="navbar.menu.instances">Instances</wicket:message>
+		</h2>
+		<div class="row">
+			<div class="col-lg-8">
+				<table class="table table-hover table-striped tablesorter">
+					<thead>
+						<tr>
+							<th class="header col-md-2"><wicket:message key="instances.header.name">Name</wicket:message></th>
+							<th class="header col-md-2"><wicket:message key="instances.header.hostname">Host</wicket:message></th>
+							<th class="header col-md-2"><wicket:message key="instances.header.username">Username</wicket:message></th>
+							<th class="header col-md-2"><wicket:message key="instances.header.status">Status</wicket:message></th>
+							<th></th>
+						</tr>
+					</thead>
+					<tbody wicket:id="instancesTable">
+						<tr wicket:id="instances">
+							<td>
+								<a href="#" data-toggle="modal" data-target="#instanceInfo" wicket:id="editInstance">
+									<span wicket:id="name">Instance name</span>
+								</a>
+							</td>
+							<td>
+								<span wicket:id="host">Host</span>
+							</td>
+							<td>
+								<span wicket:id="username">Username</span>
+							</td>
+							<td>
+								<span wicket:id="connectionStatus" class="label">Status</span>
+							</td>
+							<td>
+								<button class="btn btn-sm btn-danger" type="button" wicket:id="instanceDelete">
+									<span class="fa fa-trash-o"></span>
+								</button>
+							</td>
+						</tr>
+					</tbody>
+				</table>
+				<div wicket:id="instanceForm"></div>
+				<button class="btn btn-sm btn-primary" wicket:id="addInstance">
+					<i class="fa fa-plus"></i> <wicket:message key="instances.buttons.addInstance">Add instance</wicket:message>
+				</button>
+			</div>
+			<div class="col-lg-4">
+				<div class="panel panel-primary">
+					<div class="panel-heading">
+						<h3 class="panel-title">Help</h3>
+					</div>
+					<div class="panel-body">
+						<p>Some help about managing Nutch instances</p>
+					</div>
+				</div>
+			</div>
+		</div>
+	</wicket:extend>
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancesPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancesPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancesPage.java
new file mode 100644
index 0000000..62b7806
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/instances/InstancesPage.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.instances;
+
+import static de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType.Danger;
+import static de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType.Info;
+import static de.agilecoders.wicket.core.markup.html.bootstrap.block.LabelType.Success;
+import static org.apache.nutch.webui.client.model.ConnectionStatus.CONNECTED;
+import static org.apache.nutch.webui.client.model.ConnectionStatus.CONNECTING;
+import static org.apache.nutch.webui.client.model.ConnectionStatus.DISCONNECTED;
+
+import java.util.Iterator;
+
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.ColorEnumLabel;
+import org.apache.nutch.webui.pages.components.ColorEnumLabelBuilder;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.apache.wicket.ajax.AjaxRequestTarget;
+import org.apache.wicket.ajax.AjaxSelfUpdatingTimerBehavior;
+import org.apache.wicket.ajax.markup.html.AjaxLink;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.CompoundPropertyModel;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+import org.apache.wicket.util.time.Duration;
+
+public class InstancesPage extends AbstractBasePage<Void> {
+  @SpringBean
+  private NutchInstanceService instanceService;
+
+  private InstancePanel instancePanel;
+
+  private WebMarkupContainer instancesTable;
+  private static final Duration UPDATE_TIMEOUT = Duration.seconds(1);
+
+  public InstancesPage() {
+
+    instancesTable = new WebMarkupContainer("instancesTable");
+    instancesTable.setOutputMarkupId(true);
+    instancesTable.add(new AjaxSelfUpdatingTimerBehavior(UPDATE_TIMEOUT));
+
+    instancePanel = new InstancePanel("instanceForm");
+
+    RefreshingView<NutchInstance> instances = refreshingView();
+    instancesTable.add(instances);
+    add(instancesTable);
+    add(instancePanel);
+    add(addInstanceButton());
+  }
+
+  private RefreshingView<NutchInstance> refreshingView() {
+    RefreshingView<NutchInstance> instances = new RefreshingView<NutchInstance>(
+        "instances") {
+
+      @Override
+      protected Iterator<IModel<NutchInstance>> getItemModels() {
+        return new CpmIteratorAdapter<NutchInstance>(
+            instanceService.getInstances());
+      }
+
+      @Override
+      protected void populateItem(Item<NutchInstance> item) {
+        populateInstanceRow(item);
+      }
+    };
+    return instances;
+  }
+
+  private AjaxLink<NutchInstance> addInstanceButton() {
+    return new AjaxLink<NutchInstance>("addInstance") {
+      @Override
+      public void onClick(AjaxRequestTarget target) {
+        instancePanel.setModel(new CompoundPropertyModel<NutchInstance>(
+            new NutchInstance()));
+        target.add(instancePanel);
+        instancePanel.appendShowDialogJavaScript(target);
+      }
+    };
+  }
+
+  private void populateInstanceRow(final Item<NutchInstance> item) {
+    item.add(new AjaxLink<NutchInstance>("editInstance") {
+      @Override
+      public void onClick(AjaxRequestTarget target) {
+        instancePanel.setModel(item.getModel());
+        target.add(instancePanel);
+        instancePanel.appendShowDialogJavaScript(target);
+      }
+    }.add(new Label("name")));
+    item.add(new Label("host"));
+    item.add(new Label("username"));
+    item.add(createStatusLabel());
+    item.add(new AjaxLink<NutchInstance>("instanceDelete", item.getModel()) {
+      @Override
+      public void onClick(AjaxRequestTarget target) {
+        instanceService.removeInstance(getModelObject().getId());
+        target.add(instancesTable);
+      }
+    });
+  }
+
+  private ColorEnumLabel<ConnectionStatus> createStatusLabel() {
+    return new ColorEnumLabelBuilder<ConnectionStatus>("connectionStatus")
+        .withEnumColor(CONNECTED, Success).withEnumColor(CONNECTING, Info)
+        .withEnumColor(DISCONNECTED, Danger).build();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/menu/VerticalMenu.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/menu/VerticalMenu.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/menu/VerticalMenu.html
new file mode 100644
index 0000000..32d6e01
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/menu/VerticalMenu.html
@@ -0,0 +1,48 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<title>Navbar</title>
+</head>
+<body>
+	<wicket:panel>
+		<div class="fluid-container" wicket:id="container">
+			<!-- Brand and toggle get grouped for better mobile display -->
+			<div class="navbar-header">
+				<button type="button" class="navbar-toggle" data-toggle="collapse"
+					wicket:id="collapseButton">
+					<span class="sr-only" wicket:id="toggleNavigationLabel">[[CONTENT]]</span>
+					<span class="icon-bar"></span> <span class="icon-bar"></span> <span
+						class="icon-bar"></span>
+				</button>
+				<a wicket:id="brandName" class="navbar-brand" href="#"> <img
+					wicket:id="brandImage" /> <span wicket:id="brandLabel"></span>
+				</a>
+			</div>
+
+			<div class="collapse navbar-collapse navbar-ex1-collapse"
+				role="navigation" wicket:id="collapse">
+				<ul class="nav navbar-nav side-nav">
+					<li wicket:id="navLeftList">
+						<div wicket:id="component">[[CONTENT]]</div>
+					</li>
+				</ul>
+				<ul wicket:enclosure="navRightList"
+					class="nav navbar-nav navbar-right">
+					<li wicket:id="navRightList">
+						<div wicket:id="component">[[CONTENT]]</div>
+					</li>
+				</ul>
+			</div>
+		</div>
+	</wicket:panel>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java
new file mode 100644
index 0000000..bcdaa4d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.menu;
+
+import de.agilecoders.wicket.core.markup.html.bootstrap.navbar.Navbar;
+
+/**
+ * A vertical navigation menu. Behaviorally a plain {@link Navbar}; the
+ * vertical layout comes from the accompanying VerticalMenu.html markup.
+ */
+public class VerticalMenu extends Navbar {
+
+  public VerticalMenu(String componentId) {
+    super(componentId);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedListsPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedListsPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedListsPage.html
new file mode 100644
index 0000000..f9aff87
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedListsPage.html
@@ -0,0 +1,75 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8" />
+<title>Wicket extend</title>
+</head>
+
+<body>
+	<wicket:extend>
+		<h2>
+			<wicket:message key="navbar.menu.seedLists">Seed lists</wicket:message>
+		</h2>
+
+		<div class="row">
+			<div class="col-lg-8">
+				<div class="row">
+					<div class="col-lg-8 col-md-offset-10">
+						<button class="btn btn-success btn-default" wicket:id="newSeedList">
+							<i class="fa fa-plus"></i> Add new list
+						</button>
+					</div>
+				</div>
+				<table class="table table-hover table-striped tablesorter">
+					<thead>
+						<tr>
+							<th class="header col-md-3">Name</th>
+							<th class="header col-md-2">Urls</th>
+							<th></th>
+						</tr>
+					</thead>
+
+					<tbody>
+						<tr wicket:id="seedLists">
+							<td>
+								<a href="#" wicket:id="edit">
+									<span wicket:id="name">List name</span>
+								</a>
+							</td>
+							<td>
+								<span wicket:id="seedUrlsCount">10</span>
+							</td>
+							<td>
+								<button class="btn btn-sm btn-danger" type="button" wicket:id="delete">
+									<span class="fa fa-trash-o"></span>
+								</button>
+							</td>
+						</tr>
+					</tbody>
+				</table>
+			</div>
+			<div class="col-lg-4">
+				<div class="panel panel-primary">
+					<div class="panel-heading">
+						<h3 class="panel-title">Help</h3>
+					</div>
+					<div class="panel-body">
+						<p>Some help about seed lists</p>
+					</div>
+				</div>
+			</div>
+		</div>
+		<!--row-->
+	</wicket:extend>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java
new file mode 100644
index 0000000..c5ac288
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.seed;
+
+import java.util.Iterator;
+
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.SeedListService;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.link.BookmarkablePageLink;
+import org.apache.wicket.markup.html.link.Link;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.request.mapper.parameter.PageParameters;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+/**
+ * This page is for seed lists management
+ * 
+ * @author feodor
+ * 
+ */
+public class SeedListsPage extends AbstractBasePage<Void> {
+
+  @SpringBean
+  private SeedListService seedListService;
+
+  public SeedListsPage() {
+
+    // RefreshingView re-reads all seed lists from the service on each render.
+    RefreshingView<SeedList> seedLists = new RefreshingView<SeedList>(
+        "seedLists") {
+
+      @Override
+      protected Iterator<IModel<SeedList>> getItemModels() {
+        return new CpmIteratorAdapter<SeedList>(seedListService.findAll());
+      }
+
+      @Override
+      protected void populateItem(final Item<SeedList> item) {
+        // Pass the list id so SeedPage can load this list for editing.
+        PageParameters params = new PageParameters();
+        params.add("id", item.getModelObject().getId());
+
+        Link<Void> edit = new BookmarkablePageLink<Void>("edit",
+            SeedPage.class, params);
+        edit.add(new Label("name"));
+        item.add(edit);
+
+        item.add(new Label("seedUrlsCount"));
+
+        // Plain (non-AJAX) link: deleting triggers a full page re-render.
+        item.add(new Link<SeedList>("delete", item.getModel()) {
+          @Override
+          public void onClick() {
+            seedListService.delete(item.getModelObject().getId());
+          }
+        });
+      }
+    };
+
+    add(seedLists);
+    // No "id" parameter — presumably SeedPage opens in create mode; verify.
+    add(new BookmarkablePageLink<Void>("newSeedList", SeedPage.class));
+  }
+}


[39/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/Node.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/Node.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/Node.java
new file mode 100644
index 0000000..a35e842
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/Node.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.webgraph;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * A class which holds the number of inlinks and outlinks for a given url along
+ * with an inlink score from a link analysis program and any metadata.
+ * 
+ * The Node is the core unit of the NodeDb in the WebGraph.
+ */
+public class Node implements Writable {
+
+  // Number of links pointing to this url.
+  private int numInlinks = 0;
+  // Number of links going out from this url.
+  private int numOutlinks = 0;
+  // Link-analysis score; defaults to 1.0f before any analysis has run.
+  private float inlinkScore = 1.0f;
+  // Arbitrary per-node metadata, serialized along with the counts.
+  private Metadata metadata = new Metadata();
+
+  public Node() {
+
+  }
+
+  public int getNumInlinks() {
+    return numInlinks;
+  }
+
+  public void setNumInlinks(int numInlinks) {
+    this.numInlinks = numInlinks;
+  }
+
+  public int getNumOutlinks() {
+    return numOutlinks;
+  }
+
+  public void setNumOutlinks(int numOutlinks) {
+    this.numOutlinks = numOutlinks;
+  }
+
+  public float getInlinkScore() {
+    return inlinkScore;
+  }
+
+  public void setInlinkScore(float inlinkScore) {
+    this.inlinkScore = inlinkScore;
+  }
+
+  /**
+   * Returns the inlink score divided evenly across this node's outlinks,
+   * or the raw inlink score when the node has no outlinks.
+   */
+  public float getOutlinkScore() {
+    return (numOutlinks > 0) ? inlinkScore / numOutlinks : inlinkScore;
+  }
+
+  public Metadata getMetadata() {
+    return metadata;
+  }
+
+  public void setMetadata(Metadata metadata) {
+    this.metadata = metadata;
+  }
+
+  /** Deserializes fields in the exact order written by {@link #write}. */
+  public void readFields(DataInput in) throws IOException {
+
+    numInlinks = in.readInt();
+    numOutlinks = in.readInt();
+    inlinkScore = in.readFloat();
+    // Clear first: this instance may be reused across records.
+    metadata.clear();
+    metadata.readFields(in);
+  }
+
+  public void write(DataOutput out) throws IOException {
+
+    out.writeInt(numInlinks);
+    out.writeInt(numOutlinks);
+    out.writeFloat(inlinkScore);
+    metadata.write(out);
+  }
+
+  public String toString() {
+    return "num inlinks: " + numInlinks + ", num outlinks: " + numOutlinks
+        + ", inlink score: " + inlinkScore + ", outlink score: "
+        + getOutlinkScore() + ", metadata: " + metadata.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
new file mode 100644
index 0000000..4a57c29
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
@@ -0,0 +1,433 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.webgraph;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Iterator;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * A tool that dumps out the top URLs by number of inlinks, number of outlinks,
+ * or by score, to a text file. One of the major uses of this tool is to check
+ * the top scoring urls of a link analysis program such as LinkRank.
+ * 
+ * For number of inlinks or number of outlinks the WebGraph program will need to
+ * have been run. For link analysis score a program such as LinkRank will need
+ * to have been run which updates the NodeDb of the WebGraph.
+ */
+public class NodeDumper extends Configured implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(NodeDumper.class);
+
+  private static enum DumpType {
+    INLINKS, OUTLINKS, SCORES
+  }
+
+  private static enum AggrType {
+    SUM, MAX
+  }
+
+  private static enum NameType {
+    HOST, DOMAIN
+  }
+
+  /**
+   * Outputs the top urls sorted in descending order. Depending on the flag set
+   * on the command line, the top urls could be for number of inlinks, for
+   * number of outlinks, or for link analysis score.
+   */
+  public static class Sorter extends Configured implements
+      Mapper<Text, Node, FloatWritable, Text>,
+      Reducer<FloatWritable, Text, Text, FloatWritable> {
+
+    private JobConf conf;
+    private boolean inlinks = false;
+    private boolean outlinks = false;
+    // NOTE(review): 'scores' is assigned in configure() but never read;
+    // score is the fall-through branch in map() when neither of the other
+    // two flags is set.
+    private boolean scores = false;
+    private long topn = Long.MAX_VALUE;
+
+    /**
+     * Configures the job, sets the flag for type of content and the topN number
+     * if any.
+     */
+    public void configure(JobConf conf) {
+      this.conf = conf;
+      this.inlinks = conf.getBoolean("inlinks", false);
+      this.outlinks = conf.getBoolean("outlinks", false);
+      this.scores = conf.getBoolean("scores", true);
+      this.topn = conf.getLong("topn", Long.MAX_VALUE);
+    }
+
+    public void close() {
+    }
+
+    /**
+     * Outputs the url with the appropriate number of inlinks, outlinks, or for
+     * score.
+     */
+    public void map(Text key, Node node,
+        OutputCollector<FloatWritable, Text> output, Reporter reporter)
+        throws IOException {
+
+      float number = 0;
+      if (inlinks) {
+        number = node.getNumInlinks();
+      } else if (outlinks) {
+        number = node.getNumOutlinks();
+      } else {
+        number = node.getInlinkScore();
+      }
+
+      // number collected with negative to be descending
+      output.collect(new FloatWritable(-number), key);
+    }
+
+    /**
+     * Flips and collects the url and numeric sort value.
+     */
+    public void reduce(FloatWritable key, Iterator<Text> values,
+        OutputCollector<Text, FloatWritable> output, Reporter reporter)
+        throws IOException {
+
+      // take the negative of the negative to get original value, sometimes 0
+      // value are a little weird
+      float val = key.get();
+      FloatWritable number = new FloatWritable(val == 0 ? 0 : -val);
+      long numCollected = 0;
+
+      // collect all values, this time with the url as key.
+      // NOTE(review): numCollected is local to each reduce call, so topn
+      // limits urls emitted per distinct score value, not globally — confirm
+      // this is the intended semantics.
+      while (values.hasNext() && (numCollected < topn)) {
+        Text url = WritableUtils.clone(values.next(), conf);
+        output.collect(url, number);
+        numCollected++;
+      }
+    }
+  }
+
+  /**
+   * Outputs the hosts or domains with an associated value. This value consists
+   * of either the number of inlinks, the number of outlinks or the score. The
+   * computed value is then either the sum of all parts or the top value.
+   */
+  public static class Dumper extends Configured implements
+      Mapper<Text, Node, Text, FloatWritable>,
+      Reducer<Text, FloatWritable, Text, FloatWritable> {
+
+    private JobConf conf;
+    private boolean inlinks = false;
+    private boolean outlinks = false;
+    // NOTE(review): 'scores', 'domain' and 'max' are assigned in configure()
+    // but never read; score is the fall-through in map(), domain is the
+    // fall-through of the 'host' check, and max aggregation is the
+    // fall-through of the 'sum' check in reduce().
+    private boolean scores = false;
+    private long topn = Long.MAX_VALUE;
+    private boolean host = false;
+    private boolean domain = false;
+    private boolean sum = false;
+    private boolean max = false;
+
+    public void configure(JobConf conf) {
+      this.conf = conf;
+      this.inlinks = conf.getBoolean("inlinks", false);
+      this.outlinks = conf.getBoolean("outlinks", false);
+      this.scores = conf.getBoolean("scores", true);
+      this.topn = conf.getLong("topn", Long.MAX_VALUE);
+      this.host = conf.getBoolean("host", false);
+      this.domain = conf.getBoolean("domain", false);
+      this.sum = conf.getBoolean("sum", false);
+      this.max = conf.getBoolean("max", false);
+    }
+
+    public void close() {
+    }
+
+    /**
+     * Outputs the host or domain as key for this record and numInlinks,
+     * numOutlinks or score as the value.
+     */
+    public void map(Text key, Node node,
+        OutputCollector<Text, FloatWritable> output, Reporter reporter)
+        throws IOException {
+
+      float number = 0;
+      if (inlinks) {
+        number = node.getNumInlinks();
+      } else if (outlinks) {
+        number = node.getNumOutlinks();
+      } else {
+        number = node.getInlinkScore();
+      }
+
+      // Collapse the url key down to its host (or, by default, its domain).
+      if (host) {
+        key.set(URLUtil.getHost(key.toString()));
+      } else {
+        key.set(URLUtil.getDomainName(key.toString()));
+      }
+
+      output.collect(key, new FloatWritable(number));
+    }
+
+    /**
+     * Outputs either the sum or the top value for this record.
+     */
+    public void reduce(Text key, Iterator<FloatWritable> values,
+        OutputCollector<Text, FloatWritable> output, Reporter reporter)
+        throws IOException {
+
+      long numCollected = 0;
+      float sumOrMax = 0;
+      float val = 0;
+
+      // collect all values, this time with the url as key.
+      // NOTE(review): topn here caps how many values are aggregated per
+      // host/domain, not how many keys are emitted — confirm intended.
+      while (values.hasNext() && (numCollected < topn)) {
+        val = values.next().get();
+
+        if (sum) {
+          sumOrMax += val;
+        } else {
+          if (sumOrMax < val) {
+            sumOrMax = val;
+          }
+        }
+
+        numCollected++;
+      }
+
+      output.collect(key, new FloatWritable(sumOrMax));
+    }
+  }
+
+  /**
+   * Runs the process to dump the top urls out to a text file.
+   * 
+   * @param webGraphDb
+   *          The WebGraph from which to pull values.
+   * @param type
+   *          Whether to dump inlink counts, outlink counts, or scores.
+   * @param topN
+   *          Maximum number of records to emit; Long.MAX_VALUE for all.
+   * @param output
+   *          The output directory for the dump.
+   * @param asEff
+   *          If true, use '=' as the key/value separator so the output is
+   *          compatible with Solr's ExternalFileField.
+   * @param nameType
+   *          Group results by HOST or DOMAIN (Dumper job); null dumps
+   *          individual urls sorted by value (Sorter job).
+   * @param aggrType
+   *          When grouping, aggregate per group by SUM or MAX.
+   * @param asSequenceFile
+   *          If true, write a SequenceFile instead of plain text.
+   * 
+   * @throws Exception
+   *           If an error occurs while dumping the top values.
+   */
+  public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
+      boolean asEff, NameType nameType, AggrType aggrType,
+      boolean asSequenceFile) throws Exception {
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("NodeDumper: starting at " + sdf.format(start));
+    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
+    Configuration conf = getConf();
+
+    JobConf dumper = new NutchJob(conf);
+    dumper.setJobName("NodeDumper: " + webGraphDb);
+    FileInputFormat.addInputPath(dumper, nodeDb);
+    dumper.setInputFormat(SequenceFileInputFormat.class);
+
+    // No grouping: Sorter sorts urls by (negated) value. With grouping:
+    // Dumper keys by host/domain and aggregates in the reducer.
+    if (nameType == null) {
+      dumper.setMapperClass(Sorter.class);
+      dumper.setReducerClass(Sorter.class);
+      dumper.setMapOutputKeyClass(FloatWritable.class);
+      dumper.setMapOutputValueClass(Text.class);
+    } else {
+      dumper.setMapperClass(Dumper.class);
+      dumper.setReducerClass(Dumper.class);
+      dumper.setMapOutputKeyClass(Text.class);
+      dumper.setMapOutputValueClass(FloatWritable.class);
+    }
+
+    dumper.setOutputKeyClass(Text.class);
+    dumper.setOutputValueClass(FloatWritable.class);
+    FileOutputFormat.setOutputPath(dumper, output);
+
+    if (asSequenceFile) {
+      dumper.setOutputFormat(SequenceFileOutputFormat.class);
+    } else {
+      dumper.setOutputFormat(TextOutputFormat.class);
+    }
+
+    // Single reducer so the output forms one globally ordered list.
+    dumper.setNumReduceTasks(1);
+    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
+    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
+    dumper.setBoolean("scores", type == DumpType.SCORES);
+
+    dumper.setBoolean("host", nameType == NameType.HOST);
+    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
+    dumper.setBoolean("sum", aggrType == AggrType.SUM);
+    dumper.setBoolean("max", aggrType == AggrType.MAX);
+
+    dumper.setLong("topn", topN);
+
+    // Set equals-sign as separator for Solr's ExternalFileField
+    if (asEff) {
+      dumper.set("mapred.textoutputformat.separator", "=");
+    }
+
+    try {
+      LOG.info("NodeDumper: running");
+      JobClient.runJob(dumper);
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+      throw e;
+    }
+    long end = System.currentTimeMillis();
+    LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  /** Command-line entry point; see {@link #run(String[])} for the options. */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new NodeDumper(),
+        args);
+    System.exit(res);
+  }
+
+  /**
+   * Runs the node dumper tool.
+   *
+   * Parses the command line, determines the dump type (inlinks, outlinks or
+   * scores), optional host/domain grouping with sum/max aggregation, and
+   * delegates to {@link #dumpNodes}.
+   *
+   * @param args
+   *          command-line arguments; -webgraphdb is required.
+   * @return 0 on success, -1 on bad usage (help printed), -2 on failure.
+   */
+  public int run(String[] args) throws Exception {
+
+    Options options = new Options();
+    OptionBuilder.withArgName("help");
+    OptionBuilder.withDescription("show this help message");
+    Option helpOpts = OptionBuilder.create("help");
+    options.addOption(helpOpts);
+
+    OptionBuilder.withArgName("webgraphdb");
+    OptionBuilder.hasArg();
+    OptionBuilder.withDescription("the web graph database to use");
+    Option webGraphDbOpts = OptionBuilder.create("webgraphdb");
+    options.addOption(webGraphDbOpts);
+
+    OptionBuilder.withArgName("inlinks");
+    OptionBuilder.withDescription("show highest inlinks");
+    Option inlinkOpts = OptionBuilder.create("inlinks");
+    options.addOption(inlinkOpts);
+
+    OptionBuilder.withArgName("outlinks");
+    OptionBuilder.withDescription("show highest outlinks");
+    Option outlinkOpts = OptionBuilder.create("outlinks");
+    options.addOption(outlinkOpts);
+
+    OptionBuilder.withArgName("scores");
+    OptionBuilder.withDescription("show highest scores");
+    Option scoreOpts = OptionBuilder.create("scores");
+    options.addOption(scoreOpts);
+
+    OptionBuilder.withArgName("topn");
+    OptionBuilder.hasOptionalArg();
+    OptionBuilder.withDescription("show topN scores");
+    Option topNOpts = OptionBuilder.create("topn");
+    options.addOption(topNOpts);
+
+    OptionBuilder.withArgName("output");
+    OptionBuilder.hasArg();
+    OptionBuilder.withDescription("the output directory to use");
+    Option outputOpts = OptionBuilder.create("output");
+    options.addOption(outputOpts);
+
+    OptionBuilder.withArgName("asEff");
+    OptionBuilder
+        .withDescription("Solr ExternalFileField compatible output format");
+    Option effOpts = OptionBuilder.create("asEff");
+    options.addOption(effOpts);
+
+    OptionBuilder.hasArgs(2);
+    OptionBuilder.withDescription("group <host|domain> <sum|max>");
+    Option groupOpts = OptionBuilder.create("group");
+    options.addOption(groupOpts);
+
+    OptionBuilder.withArgName("asSequenceFile");
+    OptionBuilder.withDescription("whether to output as a sequencefile");
+    Option sequenceFileOpts = OptionBuilder.create("asSequenceFile");
+    options.addOption(sequenceFileOpts);
+
+    CommandLineParser parser = new GnuParser();
+    try {
+
+      CommandLine line = parser.parse(options, args);
+      if (line.hasOption("help") || !line.hasOption("webgraphdb")) {
+        HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp("NodeDumper", options);
+        return -1;
+      }
+
+      String webGraphDb = line.getOptionValue("webgraphdb");
+      boolean inlinks = line.hasOption("inlinks");
+      boolean outlinks = line.hasOption("outlinks");
+
+      long topN = (line.hasOption("topn") ? Long.parseLong(line
+          .getOptionValue("topn")) : Long.MAX_VALUE);
+
+      // get the correct dump type; scores is the default
+      String output = line.getOptionValue("output");
+      DumpType type = (inlinks ? DumpType.INLINKS
+          : outlinks ? DumpType.OUTLINKS : DumpType.SCORES);
+
+      NameType nameType = null;
+      AggrType aggrType = null;
+      String[] group = line.getOptionValues("group");
+      if (group != null && group.length == 2) {
+        nameType = (group[0].equals("host") ? NameType.HOST : group[0]
+            .equals("domain") ? NameType.DOMAIN : null);
+        // Fixed: the second alternative must test for "max" (it previously
+        // re-tested "sum", which made AggrType.MAX unreachable).
+        aggrType = (group[1].equals("sum") ? AggrType.SUM : group[1]
+            .equals("max") ? AggrType.MAX : null);
+      }
+
+      // Use ExternalFileField?
+      boolean asEff = line.hasOption("asEff");
+      boolean asSequenceFile = line.hasOption("asSequenceFile");
+
+      dumpNodes(new Path(webGraphDb), type, topN, new Path(output), asEff,
+          nameType, aggrType, asSequenceFile);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("NodeDumper: " + StringUtils.stringifyException(e));
+      return -2;
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/NodeReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/NodeReader.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/NodeReader.java
new file mode 100644
index 0000000..e6b6815
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/NodeReader.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.webgraph;
+
+import java.io.IOException;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Reads and prints to system out information for a single node from the NodeDb
+ * in the WebGraph.
+ */
+public class NodeReader extends Configured {
+
+  private FileSystem fs;
+  private MapFile.Reader[] nodeReaders;
+
+  public NodeReader() {
+
+  }
+
+  public NodeReader(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * Prints the content of the Node represented by the url to system out.
+   * 
+   * @param webGraphDb
+   *          The webgraph from which to get the node.
+   * @param url
+   *          The url of the node.
+   * 
+   * @throws IOException
+   *           If an error occurs while getting the node.
+   */
+  public void dumpUrl(Path webGraphDb, String url) throws IOException {
+
+    fs = FileSystem.get(getConf());
+    nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb,
+        WebGraph.NODE_DIR), getConf());
+
+    // open the readers, get the node, print out the info, and close the readers
+    Text key = new Text(url);
+    Node node = new Node();
+    MapFileOutputFormat.getEntry(nodeReaders,
+        new HashPartitioner<Text, Node>(), key, node);
+    System.out.println(url + ":");
+    System.out.println("  inlink score: " + node.getInlinkScore());
+    System.out.println("  outlink score: " + node.getOutlinkScore());
+    System.out.println("  num inlinks: " + node.getNumInlinks());
+    System.out.println("  num outlinks: " + node.getNumOutlinks());
+    FSUtils.closeReaders(nodeReaders);
+  }
+
+  /**
+   * Runs the NodeReader tool. The command line arguments must contain a
+   * webgraphdb path and a url. The url must match the normalized url that is
+   * contained in the NodeDb of the WebGraph.
+   */
+  public static void main(String[] args) throws Exception {
+
+    Options options = new Options();
+    OptionBuilder.withArgName("help");
+    OptionBuilder.withDescription("show this help message");
+    Option helpOpts = OptionBuilder.create("help");
+    options.addOption(helpOpts);
+
+    OptionBuilder.withArgName("webgraphdb");
+    OptionBuilder.hasArg();
+    OptionBuilder.withDescription("the webgraphdb to use");
+    Option webGraphOpts = OptionBuilder.create("webgraphdb");
+    options.addOption(webGraphOpts);
+
+    OptionBuilder.withArgName("url");
+    OptionBuilder.hasOptionalArg();
+    OptionBuilder.withDescription("the url to dump");
+    Option urlOpts = OptionBuilder.create("url");
+    options.addOption(urlOpts);
+
+    CommandLineParser parser = new GnuParser();
+    try {
+
+      // command line must take a webgraphdb and a url
+      CommandLine line = parser.parse(options, args);
+      if (line.hasOption("help") || !line.hasOption("webgraphdb")
+          || !line.hasOption("url")) {
+        HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp("WebGraphReader", options);
+        return;
+      }
+
+      // dump the values to system out and return
+      String webGraphDb = line.getOptionValue("webgraphdb");
+      String url = line.getOptionValue("url");
+      NodeReader reader = new NodeReader(NutchConfiguration.create());
+      reader.dumpUrl(new Path(webGraphDb), url);
+
+      return;
+    } catch (Exception e) {
+      e.printStackTrace();
+      return;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
new file mode 100644
index 0000000..19704eb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.webgraph;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+
+/**
+ * Updates the score from the WebGraph node database into the crawl database.
+ * Any score that is not in the node database is set to the clear score in the
+ * crawl database.
+ */
+public class ScoreUpdater extends Configured implements Tool,
+    Mapper<Text, Writable, Text, ObjectWritable>,
+    Reducer<Text, ObjectWritable, Text, CrawlDatum> {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ScoreUpdater.class);
+
+  private JobConf conf;
+  private float clearScore = 0.0f;
+
+  public void configure(JobConf conf) {
+    this.conf = conf;
+    clearScore = conf.getFloat("link.score.updater.clear.score", 0.0f);
+  }
+
+  /**
+   * Changes input into ObjectWritables.
+   */
+  public void map(Text key, Writable value,
+      OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+      throws IOException {
+
+    ObjectWritable objWrite = new ObjectWritable();
+    objWrite.set(value);
+    output.collect(key, objWrite);
+  }
+
+  /**
+   * Creates new CrawlDatum objects with the updated score from the NodeDb or
+   * with a cleared score.
+   */
+  public void reduce(Text key, Iterator<ObjectWritable> values,
+      OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+      throws IOException {
+
+    String url = key.toString();
+    Node node = null;
+    CrawlDatum datum = null;
+
+    // set the node and the crawl datum, should be one of each unless no node
+    // for url in the crawldb
+    while (values.hasNext()) {
+      ObjectWritable next = values.next();
+      Object value = next.get();
+      if (value instanceof Node) {
+        node = (Node) value;
+      } else if (value instanceof CrawlDatum) {
+        datum = (CrawlDatum) value;
+      }
+    }
+
+    // datum should never be null, could happen if somehow the url was
+    // normalized or changed after being pulled from the crawldb
+    if (datum != null) {
+
+      if (node != null) {
+
+        // set the inlink score in the nodedb
+        float inlinkScore = node.getInlinkScore();
+        datum.setScore(inlinkScore);
+        LOG.debug(url + ": setting to score " + inlinkScore);
+      } else {
+
+        // clear out the score in the crawldb
+        datum.setScore(clearScore);
+        LOG.debug(url + ": setting to clear score of " + clearScore);
+      }
+
+      output.collect(key, datum);
+    } else {
+      LOG.debug(url + ": no datum");
+    }
+  }
+
+  public void close() {
+  }
+
+  /**
+   * Updates the inlink score in the web graph node database into the crawl
+   * database.
+   * 
+   * @param crawlDb
+   *          The crawl database to update
+   * @param webGraphDb
+   *          The webgraph database to use.
+   * 
+   * @throws IOException
+   *           If an error occurs while updating the scores.
+   */
+  public void update(Path crawlDb, Path webGraphDb) throws IOException {
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("ScoreUpdater: starting at " + sdf.format(start));
+
+    Configuration conf = getConf();
+    FileSystem fs = FileSystem.get(conf);
+
+    // create a temporary crawldb with the new scores
+    LOG.info("Running crawldb update " + crawlDb);
+    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
+    Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME);
+    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random()
+        .nextInt(Integer.MAX_VALUE)));
+
+    // run the updater job outputting to the temp crawl database
+    JobConf updater = new NutchJob(conf);
+    updater.setJobName("Update CrawlDb from WebGraph");
+    FileInputFormat.addInputPath(updater, crawlDbCurrent);
+    FileInputFormat.addInputPath(updater, nodeDb);
+    FileOutputFormat.setOutputPath(updater, newCrawlDb);
+    updater.setInputFormat(SequenceFileInputFormat.class);
+    updater.setMapperClass(ScoreUpdater.class);
+    updater.setReducerClass(ScoreUpdater.class);
+    updater.setMapOutputKeyClass(Text.class);
+    updater.setMapOutputValueClass(ObjectWritable.class);
+    updater.setOutputKeyClass(Text.class);
+    updater.setOutputValueClass(CrawlDatum.class);
+    updater.setOutputFormat(MapFileOutputFormat.class);
+
+    try {
+      JobClient.runJob(updater);
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+
+      // remove the temp crawldb on error
+      if (fs.exists(newCrawlDb)) {
+        fs.delete(newCrawlDb, true);
+      }
+      throw e;
+    }
+
+    // install the temp crawl database
+    LOG.info("ScoreUpdater: installing new crawldb " + crawlDb);
+    CrawlDb.install(updater, crawlDb);
+
+    long end = System.currentTimeMillis();
+    LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new ScoreUpdater(),
+        args);
+    System.exit(res);
+  }
+
+  /**
+   * Runs the ScoreUpdater tool.
+   */
+  public int run(String[] args) throws Exception {
+
+    Options options = new Options();
+    OptionBuilder.withArgName("help");
+    OptionBuilder.withDescription("show this help message");
+    Option helpOpts = OptionBuilder.create("help");
+    options.addOption(helpOpts);
+
+    OptionBuilder.withArgName("crawldb");
+    OptionBuilder.hasArg();
+    OptionBuilder.withDescription("the crawldb to use");
+    Option crawlDbOpts = OptionBuilder.create("crawldb");
+    options.addOption(crawlDbOpts);
+
+    OptionBuilder.withArgName("webgraphdb");
+    OptionBuilder.hasArg();
+    OptionBuilder.withDescription("the webgraphdb to use");
+    Option webGraphOpts = OptionBuilder.create("webgraphdb");
+    options.addOption(webGraphOpts);
+
+    CommandLineParser parser = new GnuParser();
+    try {
+
+      CommandLine line = parser.parse(options, args);
+      if (line.hasOption("help") || !line.hasOption("webgraphdb")
+          || !line.hasOption("crawldb")) {
+        HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp("ScoreUpdater", options);
+        return -1;
+      }
+
+      String crawlDb = line.getOptionValue("crawldb");
+      String webGraphDb = line.getOptionValue("webgraphdb");
+      update(new Path(crawlDb), new Path(webGraphDb));
+      return 0;
+    } catch (Exception e) {
+      LOG.error("ScoreUpdater: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/WebGraph.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/WebGraph.java
new file mode 100644
index 0000000..e2c3d8b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/WebGraph.java
@@ -0,0 +1,783 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.webgraph;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.LockUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Creates three databases, one for inlinks, one for outlinks, and a node
+ * database that holds the number of in and outlinks to a url and the current
+ * score for the url.
+ * 
+ * The score is set by an analysis program such as LinkRank. The WebGraph is an
+ * update-able database. Outlinks are stored by their fetch time or by the
+ * current system time if no fetch time is available. Only the most recent
+ * version of outlinks for a given url is stored. As more crawls are executed
+ * and the WebGraph updated, newer Outlinks will replace older Outlinks. This
+ * allows the WebGraph to adapt to changes in the link structure of the web.
+ * 
+ * The Inlink database is created from the Outlink database and is regenerated
+ * when the WebGraph is updated. The Node database is created from both the
+ * Inlink and Outlink databases. Because the Node database is overwritten when
+ * the WebGraph is updated and because the Node database holds current scores
+ * for urls it is recommended that a crawl-cycle (one or more full crawls) fully
+ * complete before the WebGraph is updated and some type of analysis, such as
+ * LinkRank, is run to update scores in the Node database in a stable fashion.
+ */
+public class WebGraph extends Configured implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(WebGraph.class);
+  public static final String LOCK_NAME = ".locked";
+  public static final String INLINK_DIR = "inlinks";
+  public static final String OUTLINK_DIR = "outlinks/current";
+  public static final String OLD_OUTLINK_DIR = "outlinks/old";
+  public static final String NODE_DIR = "nodes";
+
+  /**
+   * The OutlinkDb creates a database of all outlinks. Outlinks to internal urls
+   * by domain and host can be ignored. The number of Outlinks out to a given
+   * page or domain can also be limited.
+   */
+  public static class OutlinkDb extends Configured implements
+      Mapper<Text, Writable, Text, NutchWritable>,
+      Reducer<Text, NutchWritable, Text, LinkDatum> {
+
+    public static final String URL_NORMALIZING = "webgraph.url.normalizers";
+    public static final String URL_FILTERING = "webgraph.url.filters";
+
+    // ignoring internal domains, internal hosts
+    private boolean ignoreDomain = true;
+    private boolean ignoreHost = true;
+
+    // limiting urls out to a page or to a domain
+    private boolean limitPages = true;
+    private boolean limitDomains = true;
+
+    // using normalizers and/or filters
+    private boolean normalize = false;
+    private boolean filter = false;
+
+    // url normalizers, filters and job configuration
+    private URLNormalizers urlNormalizers;
+    private URLFilters filters;
+    private JobConf conf;
+
+    /**
+     * Normalizes and trims extra whitespace from the given url.
+     * 
+     * @param url
+     *          The url to normalize.
+     * 
+     * @return The normalized url.
+     */
+    private String normalizeUrl(String url) {
+
+      if (!normalize) {
+        return url;
+      }
+
+      String normalized = null;
+      if (urlNormalizers != null) {
+        try {
+
+          // normalize and trim the url
+          normalized = urlNormalizers.normalize(url,
+              URLNormalizers.SCOPE_DEFAULT);
+          normalized = normalized.trim();
+        } catch (Exception e) {
+          LOG.warn("Skipping " + url + ":" + e);
+          normalized = null;
+        }
+      }
+      return normalized;
+    }
+
+    /**
+     * Filters the given url.
+     * 
+     * @param url
+     *          The url to filter.
+     * 
+     * @return The filtered url or null.
+     */
+    private String filterUrl(String url) {
+
+      if (!filter) {
+        return url;
+      }
+
+      try {
+        url = filters.filter(url);
+      } catch (Exception e) {
+        url = null;
+      }
+
+      return url;
+    }
+
+    /**
+     * Returns the fetch time from the parse data or the current system time if
+     * the fetch time doesn't exist.
+     * 
+     * @param data
+     *          The parse data.
+     * 
+     * @return The fetch time as a long.
+     */
+    private long getFetchTime(ParseData data) {
+
+      // default to current system time
+      long fetchTime = System.currentTimeMillis();
+      String fetchTimeStr = data.getContentMeta().get(Nutch.FETCH_TIME_KEY);
+      try {
+        // get the fetch time from the parse data
+        fetchTime = Long.parseLong(fetchTimeStr);
+      } catch (Exception e) {
+        fetchTime = System.currentTimeMillis();
+      }
+      return fetchTime;
+    }
+
+    /**
+     * Default constructor.
+     */
+    public OutlinkDb() {
+    }
+
+    /**
+     * Configurable constructor.
+     */
+    public OutlinkDb(Configuration conf) {
+      setConf(conf);
+    }
+
+    /**
+     * Configures the OutlinkDb job. Sets up internal links and link limiting.
+     */
+    public void configure(JobConf conf) {
+      this.conf = conf;
+      ignoreHost = conf.getBoolean("link.ignore.internal.host", true);
+      ignoreDomain = conf.getBoolean("link.ignore.internal.domain", true);
+      limitPages = conf.getBoolean("link.ignore.limit.page", true);
+      limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
+
+      normalize = conf.getBoolean(URL_NORMALIZING, false);
+      filter = conf.getBoolean(URL_FILTERING, false);
+
+      if (normalize) {
+        urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
+      }
+
+      if (filter) {
+        filters = new URLFilters(conf);
+      }
+    }
+
+    /**
+     * Passes through existing LinkDatum objects from an existing OutlinkDb and
+     * maps out new LinkDatum objects from new crawls ParseData.
+     */
+    public void map(Text key, Writable value,
+        OutputCollector<Text, NutchWritable> output, Reporter reporter)
+        throws IOException {
+
+      // normalize url, stop processing if null
+      String url = normalizeUrl(key.toString());
+      if (url == null) {
+        return;
+      }
+
+      // filter url
+      if (filterUrl(url) == null) {
+        return;
+      }
+
+      // Overwrite the key with the normalized URL
+      key.set(url);
+
+      if (value instanceof CrawlDatum) {
+        CrawlDatum datum = (CrawlDatum) value;
+
+        if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
+            || datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
+            || datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
+
+          // Tell the reducer to get rid of all instances of this key
+          output.collect(key, new NutchWritable(new BooleanWritable(true)));
+        }
+      } else if (value instanceof ParseData) {
+        // get the parse data and the outlinks from the parse data, along with
+        // the fetch time for those links
+        ParseData data = (ParseData) value;
+        long fetchTime = getFetchTime(data);
+        Outlink[] outlinkAr = data.getOutlinks();
+        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();
+
+        // normalize urls and put into map
+        if (outlinkAr != null && outlinkAr.length > 0) {
+          for (int i = 0; i < outlinkAr.length; i++) {
+            Outlink outlink = outlinkAr[i];
+            String toUrl = normalizeUrl(outlink.getToUrl());
+
+            if (filterUrl(toUrl) == null) {
+              continue;
+            }
+
+            // only put into map if the url doesn't already exist in the map or
+            // if it does and the anchor for that link is null, will replace if
+            // url is existing
+            boolean existingUrl = outlinkMap.containsKey(toUrl);
+            if (toUrl != null
+                && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
+              outlinkMap.put(toUrl, outlink.getAnchor());
+            }
+          }
+        }
+
+        // collect the outlinks under the fetch time
+        for (String outlinkUrl : outlinkMap.keySet()) {
+          String anchor = outlinkMap.get(outlinkUrl);
+          LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
+          output.collect(key, new NutchWritable(datum));
+        }
+      } else if (value instanceof LinkDatum) {
+        LinkDatum datum = (LinkDatum) value;
+        String linkDatumUrl = normalizeUrl(datum.getUrl());
+
+        if (filterUrl(linkDatumUrl) != null) {
+          datum.setUrl(linkDatumUrl);
+
+          // collect existing outlinks from existing OutlinkDb
+          output.collect(key, new NutchWritable(datum));
+        }
+      }
+    }
+
+    public void reduce(Text key, Iterator<NutchWritable> values,
+        OutputCollector<Text, LinkDatum> output, Reporter reporter)
+        throws IOException {
+
+      // aggregate all outlinks, get the most recent timestamp for a fetch
+      // which should be the timestamp for all of the most recent outlinks
+      long mostRecent = 0L;
+      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
+      while (values.hasNext()) {
+        Writable value = values.next().get();
+
+        if (value instanceof LinkDatum) {
+          // loop through, change out most recent timestamp if needed
+          LinkDatum next = (LinkDatum) value;
+          long timestamp = next.getTimestamp();
+          if (mostRecent == 0L || mostRecent < timestamp) {
+            mostRecent = timestamp;
+          }
+          outlinkList.add(WritableUtils.clone(next, conf));
+          reporter.incrCounter("WebGraph.outlinks", "added links", 1);
+        } else if (value instanceof BooleanWritable) {
+          BooleanWritable delete = (BooleanWritable) value;
+          // Actually, delete is always true, otherwise we don't emit it in the
+          // mapper in the first place
+          if (delete.get() == true) {
+            // This page is gone, do not emit its outlinks
+            reporter.incrCounter("WebGraph.outlinks", "removed links", 1);
+            return;
+          }
+        }
+      }
+
+      // get the url, domain, and host for the url
+      String url = key.toString();
+      String domain = URLUtil.getDomainName(url);
+      String host = URLUtil.getHost(url);
+
+      // setup checking sets for domains and pages
+      Set<String> domains = new HashSet<String>();
+      Set<String> pages = new HashSet<String>();
+
+      // loop through the link datums
+      for (LinkDatum datum : outlinkList) {
+
+        // get the url, host, domain, and page for each outlink
+        String toUrl = datum.getUrl();
+        String toDomain = URLUtil.getDomainName(toUrl);
+        String toHost = URLUtil.getHost(toUrl);
+        String toPage = URLUtil.getPage(toUrl);
+        datum.setLinkType(LinkDatum.OUTLINK);
+
+        // outlinks must be the most recent and conform to internal url and
+        // limiting rules, if it does collect it
+        if (datum.getTimestamp() == mostRecent
+            && (!limitPages || (limitPages && !pages.contains(toPage)))
+            && (!limitDomains || (limitDomains && !domains.contains(toDomain)))
+            && (!ignoreHost || (ignoreHost && !toHost.equalsIgnoreCase(host)))
+            && (!ignoreDomain || (ignoreDomain && !toDomain
+                .equalsIgnoreCase(domain)))) {
+          output.collect(key, datum);
+          pages.add(toPage);
+          domains.add(toDomain);
+        }
+      }
+    }
+
+    public void close() {
+    }
+  }
+
+  /**
+   * The InlinkDb creates a database of Inlinks. Inlinks are inverted from the
+   * OutlinkDb LinkDatum objects and are regenerated each time the WebGraph is
+   * updated.
+   */
+  private static class InlinkDb extends Configured implements
+      Mapper<Text, LinkDatum, Text, LinkDatum> {
+
+    private long timestamp;
+
+    /**
+     * Configures job. Sets timestamp for all Inlink LinkDatum objects to the
+     * current system time.
+     */
+    public void configure(JobConf conf) {
+      timestamp = System.currentTimeMillis();
+    }
+
+    public void close() {
+    }
+
+    /**
+     * Inverts the Outlink LinkDatum objects into new LinkDatum objects with a
+     * new system timestamp, type and to and from url switched.
+     */
+    public void map(Text key, LinkDatum datum,
+        OutputCollector<Text, LinkDatum> output, Reporter reporter)
+        throws IOException {
+
+      // get the to and from url and the anchor
+      String fromUrl = key.toString();
+      String toUrl = datum.getUrl();
+      String anchor = datum.getAnchor();
+
+      // flip the from and to url and set the new link type
+      LinkDatum inlink = new LinkDatum(fromUrl, anchor, timestamp);
+      inlink.setLinkType(LinkDatum.INLINK);
+      output.collect(new Text(toUrl), inlink);
+    }
+  }
+
+  /**
+   * Creates the Node database which consists of the number of in and outlinks
+   * for each url and a score slot for analysis programs such as LinkRank.
+   */
+  private static class NodeDb extends Configured implements
+      Reducer<Text, LinkDatum, Text, Node> {
+
+    /**
+     * Configures job.
+     */
+    public void configure(JobConf conf) {
+    }
+
+    public void close() {
+    }
+
+    /**
+     * Counts the number of inlinks and outlinks for each url and sets a default
+     * score of 0.0 for each url (node) in the webgraph.
+     */
+    public void reduce(Text key, Iterator<LinkDatum> values,
+        OutputCollector<Text, Node> output, Reporter reporter)
+        throws IOException {
+
+      Node node = new Node();
+      int numInlinks = 0;
+      int numOutlinks = 0;
+
+      // loop through counting number of in and out links
+      while (values.hasNext()) {
+        LinkDatum next = values.next();
+        if (next.getLinkType() == LinkDatum.INLINK) {
+          numInlinks++;
+        } else if (next.getLinkType() == LinkDatum.OUTLINK) {
+          numOutlinks++;
+        }
+      }
+
+      // set the in and outlinks and a default score of 0
+      node.setNumInlinks(numInlinks);
+      node.setNumOutlinks(numOutlinks);
+      node.setInlinkScore(0.0f);
+      output.collect(key, node);
+    }
+  }
+
+  /**
+   * Creates the three different WebGraph databases, Outlinks, Inlinks, and
+   * Node. If a current WebGraph exists then it is updated, if it doesn't exist
+   * then a new WebGraph database is created.
+   *
+   * Three MapReduce jobs run in sequence: the outlink db is updated from the
+   * segments, the inlink db is rebuilt from the new outlink db, and the node
+   * db is rebuilt from both. Each job writes to a temp dir that is swapped in
+   * only on success, and the whole update is guarded by a lock file.
+   * 
+   * @param webGraphDb
+   *          The WebGraph to create or update.
+   * @param segments
+   *          The array of segments used to update the WebGraph. Newer segments
+   *          and fetch times will overwrite older segments.
+   * @param normalize
+   *          whether to use URLNormalizers on URL's in the segment
+   * @param filter
+   *          whether to use URLFilters on URL's in the segment
+   * 
+   * @throws IOException
+   *           If an error occurs while processing the WebGraph.
+   */
+  public void createWebGraph(Path webGraphDb, Path[] segments,
+      boolean normalize, boolean filter) throws IOException {
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    if (LOG.isInfoEnabled()) {
+      LOG.info("WebGraphDb: starting at " + sdf.format(start));
+      LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
+      LOG.info("WebGraphDb: URL normalize: " + normalize);
+      LOG.info("WebGraphDb: URL filter: " + filter);
+    }
+
+    Configuration conf = getConf();
+    FileSystem fs = FileSystem.get(conf);
+
+    // lock an existing webgraphdb to prevent multiple simultaneous updates
+    Path lock = new Path(webGraphDb, LOCK_NAME);
+    if (!fs.exists(webGraphDb)) {
+      fs.mkdirs(webGraphDb);
+    }
+
+    LockUtil.createLockFile(fs, lock, false);
+
+    // outlink and temp outlink database paths
+    Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
+    Path oldOutlinkDb = new Path(webGraphDb, OLD_OUTLINK_DIR);
+
+    if (!fs.exists(outlinkDb)) {
+      fs.mkdirs(outlinkDb);
+    }
+
+    // job output goes to a randomly suffixed temp dir, swapped in only after
+    // the job succeeds, so a failed run never corrupts the existing db
+    Path tempOutlinkDb = new Path(outlinkDb + "-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+    JobConf outlinkJob = new NutchJob(conf);
+    outlinkJob.setJobName("Outlinkdb: " + outlinkDb);
+
+    boolean deleteGone = conf.getBoolean("link.delete.gone", false);
+    boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
+
+    if (deleteGone) {
+      LOG.info("OutlinkDb: deleting gone links");
+    }
+
+    // get the parse data and crawl fetch data for all segments
+    if (segments != null) {
+      for (int i = 0; i < segments.length; i++) {
+        Path parseData = new Path(segments[i], ParseData.DIR_NAME);
+        if (fs.exists(parseData)) {
+          LOG.info("OutlinkDb: adding input: " + parseData);
+          FileInputFormat.addInputPath(outlinkJob, parseData);
+        }
+
+        // crawl_fetch is only needed when gone links are to be deleted
+        if (deleteGone) {
+          Path crawlFetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
+          if (fs.exists(crawlFetch)) {
+            LOG.info("OutlinkDb: adding input: " + crawlFetch);
+            FileInputFormat.addInputPath(outlinkJob, crawlFetch);
+          }
+        }
+      }
+    }
+
+    // add the existing webgraph
+    LOG.info("OutlinkDb: adding input: " + outlinkDb);
+    FileInputFormat.addInputPath(outlinkJob, outlinkDb);
+
+    outlinkJob.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
+    outlinkJob.setBoolean(OutlinkDb.URL_FILTERING, filter);
+
+    outlinkJob.setInputFormat(SequenceFileInputFormat.class);
+    outlinkJob.setMapperClass(OutlinkDb.class);
+    outlinkJob.setReducerClass(OutlinkDb.class);
+    outlinkJob.setMapOutputKeyClass(Text.class);
+    outlinkJob.setMapOutputValueClass(NutchWritable.class);
+    outlinkJob.setOutputKeyClass(Text.class);
+    outlinkJob.setOutputValueClass(LinkDatum.class);
+    FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
+    outlinkJob.setOutputFormat(MapFileOutputFormat.class);
+    // suppress _SUCCESS marker files in the MapFile output directory
+    outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+        false);
+
+    // run the outlinkdb job and replace any old outlinkdb with the new one
+    try {
+      LOG.info("OutlinkDb: running");
+      JobClient.runJob(outlinkJob);
+      LOG.info("OutlinkDb: installing " + outlinkDb);
+      // current db becomes the backup, temp output becomes the current db
+      FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
+      FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
+      if (!preserveBackup && fs.exists(oldOutlinkDb))
+        fs.delete(oldOutlinkDb, true);
+      LOG.info("OutlinkDb: finished");
+    } catch (IOException e) {
+
+      // remove lock file and and temporary directory if an error occurs
+      LockUtil.removeLockFile(fs, lock);
+      if (fs.exists(tempOutlinkDb)) {
+        fs.delete(tempOutlinkDb, true);
+      }
+      LOG.error(StringUtils.stringifyException(e));
+      // rethrow after cleanup so callers still see the failure
+      throw e;
+    }
+
+    // inlink and temp link database paths; the inlink db is rebuilt entirely
+    // from the freshly installed outlink db (no backup copy is kept)
+    Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
+    Path tempInlinkDb = new Path(inlinkDb + "-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    JobConf inlinkJob = new NutchJob(conf);
+    inlinkJob.setJobName("Inlinkdb " + inlinkDb);
+    LOG.info("InlinkDb: adding input: " + outlinkDb);
+    FileInputFormat.addInputPath(inlinkJob, outlinkDb);
+    inlinkJob.setInputFormat(SequenceFileInputFormat.class);
+    inlinkJob.setMapperClass(InlinkDb.class);
+    inlinkJob.setMapOutputKeyClass(Text.class);
+    inlinkJob.setMapOutputValueClass(LinkDatum.class);
+    inlinkJob.setOutputKeyClass(Text.class);
+    inlinkJob.setOutputValueClass(LinkDatum.class);
+    FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
+    inlinkJob.setOutputFormat(MapFileOutputFormat.class);
+    inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+        false);
+
+    try {
+
+      // run the inlink and replace any old with new
+      LOG.info("InlinkDb: running");
+      JobClient.runJob(inlinkJob);
+      LOG.info("InlinkDb: installing " + inlinkDb);
+      FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
+      LOG.info("InlinkDb: finished");
+    } catch (IOException e) {
+
+      // remove lock file and and temporary directory if an error occurs
+      LockUtil.removeLockFile(fs, lock);
+      if (fs.exists(tempInlinkDb)) {
+        fs.delete(tempInlinkDb, true);
+      }
+      LOG.error(StringUtils.stringifyException(e));
+      throw e;
+    }
+
+    // node and temp node database paths; the node db aggregates per-url link
+    // counts from both the outlink and inlink dbs
+    Path nodeDb = new Path(webGraphDb, NODE_DIR);
+    Path tempNodeDb = new Path(nodeDb + "-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    JobConf nodeJob = new NutchJob(conf);
+    nodeJob.setJobName("NodeDb " + nodeDb);
+    LOG.info("NodeDb: adding input: " + outlinkDb);
+    LOG.info("NodeDb: adding input: " + inlinkDb);
+    FileInputFormat.addInputPath(nodeJob, outlinkDb);
+    FileInputFormat.addInputPath(nodeJob, inlinkDb);
+    nodeJob.setInputFormat(SequenceFileInputFormat.class);
+    // no mapper class is set: records pass straight through to the reducer
+    nodeJob.setReducerClass(NodeDb.class);
+    nodeJob.setMapOutputKeyClass(Text.class);
+    nodeJob.setMapOutputValueClass(LinkDatum.class);
+    nodeJob.setOutputKeyClass(Text.class);
+    nodeJob.setOutputValueClass(Node.class);
+    FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
+    nodeJob.setOutputFormat(MapFileOutputFormat.class);
+    nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+        false);
+
+    try {
+
+      // run the node job and replace old nodedb with new
+      LOG.info("NodeDb: running");
+      JobClient.runJob(nodeJob);
+      LOG.info("NodeDb: installing " + nodeDb);
+      FSUtils.replace(fs, nodeDb, tempNodeDb, true);
+      LOG.info("NodeDb: finished");
+    } catch (IOException e) {
+
+      // remove lock file and and temporary directory if an error occurs
+      LockUtil.removeLockFile(fs, lock);
+      if (fs.exists(tempNodeDb)) {
+        fs.delete(tempNodeDb, true);
+      }
+      LOG.error(StringUtils.stringifyException(e));
+      throw e;
+    }
+
+    // remove the lock file for the webgraph
+    LockUtil.removeLockFile(fs, lock);
+
+    long end = System.currentTimeMillis();
+    LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  /**
+   * Command-line entry point; delegates to {@link #run(String[])} through
+   * Hadoop's ToolRunner so generic Hadoop options are honored.
+   */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new WebGraph(), args);
+    System.exit(res);
+  }
+
+  /**
+   * Parses command link arguments and runs the WebGraph jobs.
+   *
+   * Expects -webgraphdb plus -segment and/or -segmentDir, with optional
+   * -normalize and -filter flags.
+   *
+   * @return 0 on success, -1 on a usage error, -2 on any other failure
+   */
+  public int run(String[] args) throws Exception {
+
+    // boolean options
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    Option normOpt = new Option("n", "normalize", false,
+        "whether to use URLNormalizers on the URL's in the segment");
+    Option filtOpt = new Option("f", "filter", false,
+        "whether to use URLFilters on the URL's in the segment");
+
+    // argument options
+    @SuppressWarnings("static-access")
+    Option graphOpt = OptionBuilder
+        .withArgName("webgraphdb")
+        .hasArg()
+        .withDescription(
+            "the web graph database to create (if none exists) or use if one does")
+        .create("webgraphdb");
+    @SuppressWarnings("static-access")
+    Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
+        .withDescription("the segment(s) to use").create("segment");
+    @SuppressWarnings("static-access")
+    Option segDirOpt = OptionBuilder.withArgName("segmentDir").hasArgs()
+        .withDescription("the segment directory to use").create("segmentDir");
+
+    // create the options
+    Options options = new Options();
+    options.addOption(helpOpt);
+    options.addOption(normOpt);
+    options.addOption(filtOpt);
+    options.addOption(graphOpt);
+    options.addOption(segOpt);
+    options.addOption(segDirOpt);
+
+    CommandLineParser parser = new GnuParser();
+    try {
+      CommandLine line = parser.parse(options, args);
+      // usage error: webgraphdb is mandatory, as is at least one segment source
+      if (line.hasOption("help") || !line.hasOption("webgraphdb")
+          || (!line.hasOption("segment") && !line.hasOption("segmentDir"))) {
+        HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp("WebGraph", options, true);
+        return -1;
+      }
+
+      String webGraphDb = line.getOptionValue("webgraphdb");
+
+      Path[] segPaths = null;
+
+      // Handle segment option
+      if (line.hasOption("segment")) {
+        String[] segments = line.getOptionValues("segment");
+        segPaths = new Path[segments.length];
+        for (int i = 0; i < segments.length; i++) {
+          segPaths[i] = new Path(segments[i]);
+        }
+      }
+
+      // Handle segmentDir option
+      // note: this reassigns segPaths, overriding any -segment values above
+      if (line.hasOption("segmentDir")) {
+        Path dir = new Path(line.getOptionValue("segmentDir"));
+        FileSystem fs = dir.getFileSystem(getConf());
+        FileStatus[] fstats = fs.listStatus(dir,
+            HadoopFSUtil.getPassDirectoriesFilter(fs));
+        segPaths = HadoopFSUtil.getPaths(fstats);
+      }
+
+      boolean normalize = false;
+
+      if (line.hasOption("normalize")) {
+        normalize = true;
+      }
+
+      boolean filter = false;
+
+      if (line.hasOption("filter")) {
+        filter = true;
+      }
+
+      createWebGraph(new Path(webGraphDb), segPaths, normalize, filter);
+      return 0;
+    } catch (Exception e) {
+      // log and signal failure instead of propagating to ToolRunner
+      LOG.error("WebGraph: " + StringUtils.stringifyException(e));
+      return -2;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/package-info.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/package-info.java
new file mode 100644
index 0000000..a568b46
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring implementation based on link analysis
+ * ({@link org.apache.nutch.scoring.webgraph.LinkRank}),
+ * see {@link org.apache.nutch.scoring.webgraph.WebGraph}.
+ */
+package org.apache.nutch.scoring.webgraph;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/segment/ContentAsTextInputFormat.java b/nutch-core/src/main/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
new file mode 100644
index 0000000..d67b590
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileRecordReader;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * An input format that takes Nutch Content objects and converts them to text
+ * while converting newline endings to spaces. This format is useful for working
+ * with Nutch content objects in Hadoop Streaming with other languages.
+ */
+public class ContentAsTextInputFormat extends
+    SequenceFileInputFormat<Text, Text> {
+
+  /**
+   * Wraps a SequenceFile reader of (Text, Content) records and presents them
+   * as (Text, Text) with newlines flattened to spaces.
+   */
+  private static class ContentAsTextRecordReader implements
+      RecordReader<Text, Text> {
+
+    private final SequenceFileRecordReader<Text, Content> sequenceFileRecordReader;
+
+    // reusable buffers for the underlying (Text, Content) records
+    private Text innerKey;
+    private Content innerValue;
+
+    public ContentAsTextRecordReader(Configuration conf, FileSplit split)
+        throws IOException {
+      sequenceFileRecordReader = new SequenceFileRecordReader<Text, Content>(
+          conf, split);
+      innerKey = sequenceFileRecordReader.createKey();
+      innerValue = sequenceFileRecordReader.createValue();
+    }
+
+    public Text createKey() {
+      return new Text();
+    }
+
+    public Text createValue() {
+      return new Text();
+    }
+
+    /**
+     * Reads the next (Text, Content) record and copies it into the supplied
+     * key and value as plain text.
+     *
+     * @return false when the underlying reader is exhausted
+     */
+    public synchronized boolean next(Text key, Text value) throws IOException {
+
+      // convert the content object to text
+      Text tKey = key;
+      if (!sequenceFileRecordReader.next(innerKey, innerValue)) {
+        return false;
+      }
+      tKey.set(innerKey.toString());
+      // NOTE(review): decodes the raw bytes with the platform default
+      // charset — confirm this matches the actual content encoding
+      String contentAsStr = new String(innerValue.getContent());
+
+      // replace new line endings with spaces
+      // (only "\n" is replaced; any "\r" characters are left untouched)
+      contentAsStr = contentAsStr.replaceAll("\n", " ");
+      value.set(contentAsStr);
+
+      return true;
+    }
+
+    public float getProgress() throws IOException {
+      return sequenceFileRecordReader.getProgress();
+    }
+
+    public synchronized long getPos() throws IOException {
+      return sequenceFileRecordReader.getPos();
+    }
+
+    public synchronized void close() throws IOException {
+      sequenceFileRecordReader.close();
+    }
+  }
+
+  public ContentAsTextInputFormat() {
+    super();
+  }
+
+  /** Returns a reader that converts each Content record to flattened text. */
+  public RecordReader<Text, Text> getRecordReader(InputSplit split,
+      JobConf job, Reporter reporter) throws IOException {
+
+    reporter.setStatus(split.toString());
+    return new ContentAsTextRecordReader(job, (FileSplit) split);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/segment/SegmentChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/segment/SegmentChecker.java b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentChecker.java
new file mode 100644
index 0000000..ec601f4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentChecker.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.io.IOException;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Checks whether a segment is valid, or has a certain status (generated,
+ * fetched, parsed), or can be used safely for a certain processing step
+ * (e.g., indexing).
+ */
+public class SegmentChecker {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SegmentChecker.class);
+
+  /**
+   * Check if the segment is indexable. May add new check methods here.
+   *
+   * @param segmentPath path of the segment directory to check
+   * @param fs filesystem holding the segment
+   * @return true if every indexability check passes; false (with a log
+   *         message) if the arguments are null or a check fails
+   */
+  public static boolean isIndexable(Path segmentPath, FileSystem fs)
+      throws IOException {
+    if (segmentPath == null || fs == null) {
+      LOG.info("No segment path or filesystem set.");
+      return false;
+    }
+
+    // AND together all checks so adding a new one is a single line
+    boolean checkResult = true;
+    checkResult &= checkSegmentDir(segmentPath, fs);
+    // Add new check methods here
+
+    if (checkResult) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Check the segment to see if it is valid based on the sub directories.
+   * A complete segment must contain the crawl_fetch, crawl_parse, parse_data
+   * and parse_text sub directories.
+   */
+  public static boolean checkSegmentDir(Path segmentPath, FileSystem fs)
+      throws IOException {
+
+    // segment dirs are named by a 14-character timestamp (presumably
+    // yyyyMMddHHmmss); any other name is rejected outright
+    if (segmentPath.getName().length() != 14) {
+      LOG.warn("The input path at {} is not a segment... skipping", segmentPath.getName());
+      return false;
+    }
+    
+    FileStatus[] fstats_segment = fs.listStatus(segmentPath,
+        HadoopFSUtil.getPassDirectoriesFilter(fs));
+    Path[] segment_files = HadoopFSUtil.getPaths(fstats_segment);
+
+    boolean crawlFetchExists = false;
+    boolean crawlParseExists = false;
+    boolean parseDataExists = false;
+    boolean parseTextExists = false;
+
+    for (Path path : segment_files) {
+      String pathName = path.getName();
+      crawlFetchExists |= pathName.equals(CrawlDatum.FETCH_DIR_NAME);
+      crawlParseExists |= pathName.equals(CrawlDatum.PARSE_DIR_NAME);
+      parseDataExists |= pathName.equals(ParseData.DIR_NAME);
+      parseTextExists |= pathName.equals(ParseText.DIR_NAME);
+    }
+
+    if (parseTextExists && crawlParseExists && crawlFetchExists
+        && parseDataExists) {
+
+      // No segment dir missing
+      LOG.info("Segment dir is complete: " + segmentPath.toString() + ".");
+
+      return true;
+    } else {
+
+      // log the missing dir
+      StringBuilder missingDir = new StringBuilder("");
+      if (parseDataExists == false) {
+        missingDir.append(ParseData.DIR_NAME + ", ");
+      }
+      if (parseTextExists == false) {
+        missingDir.append(ParseText.DIR_NAME + ", ");
+      }
+      if (crawlParseExists == false) {
+        missingDir.append(CrawlDatum.PARSE_DIR_NAME + ", ");
+      }
+      if (crawlFetchExists == false) {
+        missingDir.append(CrawlDatum.FETCH_DIR_NAME + ", ");
+      }
+
+      // strip the trailing ", " before logging
+      String missingDirString = missingDir.toString();
+      LOG.warn("Skipping segment: " + segmentPath.toString()
+          + ". Missing sub directories: "
+          + missingDirString.substring(0, missingDirString.length() - 2));
+
+      return false;
+    }
+
+  }
+
+  /**
+   * Check the segment to see if it is has been parsed before.
+   * Presence of the crawl_parse sub directory is taken as proof of parsing.
+   */
+  public static boolean isParsed(Path segment, FileSystem fs)
+      throws IOException {
+
+      if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME))){
+	return true;
+      }
+      return false;
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMergeFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMergeFilter.java b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMergeFilter.java
new file mode 100644
index 0000000..6d53809
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMergeFilter.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.util.Collection;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Interface used to filter segments during segment merge. It allows filtering
+ * on more sophisticated criteria than just URLs. In particular it allows
+ * filtering based on metadata collected while parsing page.
+ * 
+ */
+public interface SegmentMergeFilter {
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = SegmentMergeFilter.class.getName();
+
+  /**
+   * The filtering method which gets all information being merged for a given
+   * key (URL).
+   * 
+   * @param key the URL being considered for the merged segment
+   * @param generateData crawl datum from the generate phase
+   * @param fetchData crawl datum from the fetch phase
+   * @param sigData crawl datum carrying the page signature
+   * @param content the raw page content
+   * @param parseData metadata produced by parsing
+   * @param parseText extracted text produced by parsing
+   * @param linked collection of linked crawl datums for the key
+   * @return <tt>true</tt> values for this <tt>key</tt> (URL) should be merged
+   *         into the new segment.
+   */
+  public boolean filter(Text key, CrawlDatum generateData,
+      CrawlDatum fetchData, CrawlDatum sigData, Content content,
+      ParseData parseData, ParseText parseText, Collection<CrawlDatum> linked);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMergeFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMergeFilters.java b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMergeFilters.java
new file mode 100644
index 0000000..7aa2de3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMergeFilters.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.util.Collection;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * This class wraps all {@link SegmentMergeFilter} extensions in a single object
+ * so it is easier to operate on them. If any of extensions returns
+ * <tt>false</tt> this one will return <tt>false</tt> as well.
+ * 
+ */
+public class SegmentMergeFilters {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SegmentMergeFilters.class);
+  private SegmentMergeFilter[] filters;
+
+  /**
+   * Loads all registered {@link SegmentMergeFilter} extensions from the
+   * plugin repository.
+   *
+   * @param conf configuration used to look up the plugin repository
+   * @throws RuntimeException if the extension point cannot be found or a
+   *           plugin fails to instantiate
+   */
+  public SegmentMergeFilters(Configuration conf) {
+    try {
+      ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+          SegmentMergeFilter.X_POINT_ID);
+      if (point == null)
+        // fix: name the extension point actually looked up above (the
+        // message previously reported URLFilter.X_POINT_ID, which is
+        // misleading when diagnosing a missing plugin)
+        throw new RuntimeException(SegmentMergeFilter.X_POINT_ID
+            + " not found.");
+      Extension[] extensions = point.getExtensions();
+      filters = new SegmentMergeFilter[extensions.length];
+      for (int i = 0; i < extensions.length; i++) {
+        filters[i] = (SegmentMergeFilter) extensions[i].getExtensionInstance();
+      }
+    } catch (PluginRuntimeException e) {
+      // wrap with the original cause preserved for diagnosis
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Iterates over all {@link SegmentMergeFilter} extensions and if any of them
+   * returns false, it will return false as well.
+   * 
+   * @return <tt>true</tt> values for this <tt>key</tt> (URL) should be merged
+   *         into the new segment.
+   */
+  public boolean filter(Text key, CrawlDatum generateData,
+      CrawlDatum fetchData, CrawlDatum sigData, Content content,
+      ParseData parseData, ParseText parseText, Collection<CrawlDatum> linked) {
+    for (SegmentMergeFilter filter : filters) {
+      if (!filter.filter(key, generateData, fetchData, sigData, content,
+          parseData, parseText, linked)) {
+        // parameterized logging defers string construction until trace is on
+        LOG.trace("Key {} dropped by {}", key, filter.getClass().getName());
+        return false;
+      }
+    }
+    LOG.trace("Key {} accepted for merge.", key);
+    return true;
+  }
+}


[17/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/pt.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/pt.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/pt.test
new file mode 100644
index 0000000..58c7e05
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/pt.test
@@ -0,0 +1,105 @@
+Reinício da sessão
+Declaro reaberta a sessão do Parlamento Europeu, que tinha sido interrompida na sexta-feira, 17 de Dezembro último, e renovo todos os meus votos, esperando que tenham tido boas férias.
+Como puderam constatar, o grande "bug do ano 2000" n�o aconteceu. Em contrapartida, os cidad�os de alguns dos nossos pa�ses foram v�timas de cat�strofes naturais verdadeiramente terr�veis. Os senhores manifestaram o desejo de se proceder a um debate sobre o assunto nos pr�ximos dias, durante este per�odo de sess�es. Entretanto, gostaria - como tamb�m me foi pedido por um certo n�mero de colegas - que observ�ssemos um minuto de sil�ncio por todas as v�timas, nomeadamente das tempestades, nos diferentes pa�ses da Uni�o Europeia que foram afectados. Convido-os a levantarem-se para um minuto de sil�ncio.
+(O Parlamento, de pé, guarda um minuto de silêncio)
+
+Senhora Presidente, intervenho para um ponto de ordem. Certamente que j� tomou conhecimento, pelas not�cias transmitidas na imprensa e na televis�o, dos diversos atentados � bomba e assass�nios perpetrados no Sri Lanka. Uma das pessoas recentemente assassinadas foi o senhor Kumar Ponnambalam, que ainda h� poucos meses visitara o Parlamento Europeu. Ser� que a senhora Presidente poderia enviar uma carta � Presidente do Sri Lanka manifestando o pesar do Parlamento por esta e outras mortes violentas perpetradas no seu pa�s, e instando�a a envidar todos os esfor�os ao seu alcance para procurar obter uma reconcilia��o pac�fica na situa��o extremamente dif�cil que ali se vive?
+
+Sim, Senhor Deputado Evans, penso que uma iniciativa no sentido que acaba de sugerir seria perfeitamente adequada. Se a assembleia estiver de acordo, farei como sugeriu o senhor deputado Evans.
+
+Senhora Presidente, intervenho para um ponto de ordem. Nos termos do artigo 143� do Regimento, gostaria que me desse o seu parecer em rela��o � inadmissibilidade de uma proposta. A quest�o que pretendo focar incide sobre um relat�rio que ser� aqui tratado na quinta�feira, ocasi�o em que tenciono levant�la novamente.
+O relat�rio Cunha sobre os resultados dos programas de orienta��o plurianuais para as frotas de pesca, inscrito na ordem do dia de quinta�feira, cont�m, no seu n� 6, uma proposta no sentido da aplica��o de san��es, sob a forma de uma redu��o autom�tica das quotas, aos Estados�Membros que n�o cumpram anualmente os objectivos de redu��o da respectiva frota definidos nos programas de orienta��o plurianuais. Mais se prop�e que a aplica��o das san��es seja feita independentemente da salvaguarda do princ�pio da estabilidade relativa. O princ�pio da estabilidade relativa constitui, em minha opini�o, um princ�pio jur�dico fundamental da pol�tica comum da pesca, pelo que toda e qualquer proposta que vise subvert�lo �, com certeza, juridicamente inadmiss�vel. Gostaria de saber se posso levantar uma objec��o deste tipo ao que constitui meramente um relat�rio, n�o uma proposta legislativa, e se tenho compet�ncia para o fazer na quinta�feira.
+
+Com efeito, � precisamente nessa altura que, se o desejar, poder� levantar a quest�o, isto �, na quinta-feira antes do in�cio da apresenta��o do relat�rio.
+
+Senhora Presidente, coincidindo com a primeira sess�o deste ano do Parlamento Europeu, nos Estados Unidos, no Texas, est� marcada, lamentavelmente para a pr�xima quinta-feira, a execu��o de um condenado � morte, um jovem de 34 anos a quem designaremos por X.
+A pedido de um deputado franc�s, o senhor deputado Zimeray, j� foi apresentada uma peti��o, que teve muitos signat�rios, entre os quais o abaixo assinado, mas pe�o�lhe, em conformidade com a orienta��o repetidamente expressa pelo Parlamento Europeu e por toda a Comunidade Europeia, que intervenha, com o prest�gio do seu cargo e da Institui��o que representa, junto do Presidente e do Governador do Texas, Bush, que tem poder para suspender a condena��o � morte e perdoar o condenado.
+E tudo isso em conformidade com os princ�pios que sempre defendemos.
+
+Obrigada, Senhor Deputado Segni, f�-lo-ei de boa vontade. Com efeito, essa � a linha das posi��es que o nosso Parlamento sempre adoptou.
+
+Senhora Presidente, gostaria de chamar a sua aten��o para um caso de que este Parlamento repetidamente se tem ocupado. � o caso de Alexander Nikitin. Congratulamo-nos todos aqui pelo facto de o tribunal o ter posto em liberdade e ter tornado claro que tamb�m na R�ssia o acesso �s informa��es sobre o ambiente constitui direito constitucional. No entanto, sucede agora que ele dever� ser� novamente acusado, uma vez que o Minist�rio P�blico ir� interpor recurso. Sabemos e constat�mo�lo realmente em muit�ssimas resolu��es - precisamente durante a �ltima sess�o plen�ria do ano passado - que aqui n�o se trata apenas de um caso jur�dico e que � errado acusar Alexander Nikitin de crime e de trai��o, uma vez que n�s, que somos afectados, podemos tirar proveito dos resultados por ele conseguidos. Estes resultados constituem a base do Programa Europeu de Defesa do Mar de Barents e, por esse motivo, pe�o-lhe que analise um projecto de carta que lhe exp�e os factos ma
 is importantes, e que, de acordo com as decis�es do Parlamento, torne clara esta posi��o na R�ssia.
+
+Sim, Senhora Deputada Schroedter, analisarei de boa vontade os factos relativos a essa quest�o, logo que receba a sua carta.
+
+� (NL) Senhora Presidente, em primeiro lugar, gostaria de a felicitar pelo facto de ter cumprido a sua palavra e de agora, neste primeiro per�odo de sess�es do novo ano, ter aumentado realmente muito o n�mero de canais de televis�o nos nossos gabinetes. Todavia, Senhora Presidente, n�o se fez aquilo que eu tinha pedido. � certo que h� agora dois canais finlandeses e um portugu�s, mas continua a n�o haver um �nico canal holand�s, como eu lhe tinha pedido, porque tamb�m os holandeses gostam de ver os notici�rios, todos os meses, quando somos mandados para aqui, para este desterro. Queria, portanto, pedir�lhe, uma vez mais, que tome a seu cargo fazer com que tamb�m recebamos um canal holand�s.
+
+Senhora Deputada Plooij-van Gorsel, posso dizer-lhe que essa quest�o se encontra inscrita na ordem de trabalhos da reuni�o dos Questores de quarta�feira. Ser� analisada, espero, num esp�rito construtivo.
+
+Senhora Presidente, poder�me�� dizer por que raz�o este Parlamento n�o respeita a legisla��o por si pr�prio aprovada em mat�ria de seguran�a e higiene? Por que raz�o n�o foi efectuado neste edif�cio onde nos encontramos qualquer ensaio sobre a qualidade do ar desde que fomos eleitos? Por que raz�o o nosso Comit� da Seguran�a e Higiene n�o se re�ne desde 1998? Porque n�o foi efectuado qualquer exerc�cio de simula��o de inc�ndio nos edif�cios do Parlamento, quer em Bruxelas quer em Estrasburgo? Porque n�o se encontram afixadas instru��es a seguir em caso de inc�ndio? Por que raz�o as escadas n�o foram melhoradas desde o meu acidente? Porque n�o s�o criadas zonas obrigat�rias de n�o fumadores? Considero absolutamente vergonhoso o facto de n�o respeitarmos a legisla��o que n�s pr�prios aprov�mos.
+
+Senhora Deputada Lynne, tem toda a raz�o. Vou verificar se nada disso foi efectivamente feito. Submeterei tamb�m o problema ao Col�gio dos Questores e estou certa de que os nossos Questores levar�o a peito fazer com que respeitemos a regulamenta��o que, com efeito, aprovamos.
+
+� (ES) Senhora Presidente, a senhora deputada D�ez Gonz�lez e eu pr�prio t�nhamos apresentado algumas perguntas a respeito de certas opini�es da senhora Vice-presidente, senhora Comiss�ria de Palacio, que tinham sido reproduzidas num jornal espanhol. Os servi�os competentes n�o as inclu�ram na ordem do dia, por considerarem que j� lhes tinha sido dada resposta numa sess�o anterior.
+Rogo-lhe que reconsidere esta decis�o, porque as coisas n�o se passaram assim. As perguntas a que tinha sido dada resposta anteriormente referiam-se � interven��o da senhora Comiss�ria de Palacio em determinado dossier e n�o a essas declara��es aparecidas no jornal ABC, no dia 18 do passado m�s de Novembro.
+
+Cara colega, vamos verificar tudo isso. Confesso que, para j�, as coisas me parecem um pouco confusas. Assim, vamos rever essa quest�o muito seriamente, para que tudo fique em ordem.
+
+Senhora Presidente, gostaria de saber se esta semana o Parlamento ter� oportunidade de manifestar a sua inequ�voca posi��o de descontentamento face � decis�o, hoje tomada, de n�o renovar o embargo de armas destinadas � Indon�sia, tendo em aten��o que a grande maioria da assembleia apoiou o referido embargo quando este foi decretado. Perante a situa��o que se vive naquela regi�o, a decis�o hoje tomada de n�o renovar o embargo de armas � extremamente perigosa. O Parlamento deveria, pois, enviar um sinal inequ�voco do seu descontentamento face � suspens�o do embargo, uma vez que � essa a posi��o da grande maioria da assembleia. A recusa por parte dos Estados�Membros da Uni�o Europeia de renovar o embargo de armas destinadas � Indon�sia � uma atitude irrespons�vel. Como j� aqui foi afirmado, a situa��o que ali se vive � extremamente vol�til. Existe, ali�s, o risco de poder haver um golpe militar no futuro. N�o sabemos exactamente o que ali se est� a 
 passar. Como � poss�vel a UE permitir que os fabricantes europeus de armamentos obtenham lucros � custa das vidas de seres humanos inocentes?
+
+Seja como for, essa quest�o n�o figura, para j�, entre os pedidos de aplica��o do processo de urg�ncia para a pr�xima quinta-feira.
+
+Ordem dos trabalhos
+Segue-se na ordem do dia a fixa��o da ordem de trabalhos. Foi distribu�da a vers�o final do projecto de ordem do dia do presente per�odo de sess�es, elaborada pela Confer�ncia dos Presidentes, reunida na quinta-feira, 13 de Janeiro, nos termos do artigo 110� do Regimento. Relativamente a segunda e ter�a-feiras, n�o foram apresentados pedidos de modifica��o.
+Quarta-feira:
+O Grupo do Partido dos Socialistas Europeus pede a inclus�o de uma declara��o da Comiss�o sobre os seus objectivos estrat�gicos para os pr�ximos cinco anos, bem com o sobre a reforma administrativa da Comiss�o.
+Gostaria que o senhor deputado Bar�n Crespo, autor do pedido, interviesse para o justificar, caso pretenda, evidentemente. Em seguida, faremos como � costume: ouviremos um orador a favor e um orador contra.
+
+� (ES) Senhora Presidente, a apresenta��o do programa pol�tico da Comiss�o Prodi para toda a legislatura foi inicialmente uma proposta do Grupo do Partido dos Socialistas Europeus, que, em Setembro, conseguiu a unanimidade na Confer�ncia dos Presidentes, bem como a aceita��o expl�cita do Presidente da Comiss�o, Romano Prodi, que reiterou o seu compromisso no seu discurso de investidura.
+Este compromisso � importante na medida em que a Comiss�o � um organismo que, de acordo com os Tratados, det�m o monop�lio da iniciativa e que, portanto, determina fundamentalmente o que vai ser a actividade pol�tica e legislativa deste Parlamento nos pr�ximos cinco anos. Recordo, al�m disso, Senhora Presidente, que, na anterior legislatura, este Parlamento deu, em duas ocasi�es diferentes, o seu voto de confian�a ao Presidente da Comiss�o, Romano Prodi, coisa que voltou a fazer em Julho, nesta legislatura. Mais tarde, j� com a nova Comiss�o em funcionamento, voltou a dar um voto de confian�a a toda a Comiss�o em Setembro. J� houve, portanto, tempo suficiente para a Comiss�o preparar o seu programa e para n�s podermos tomar conhecimento dele e explan�lo aos cidad�os. Neste sentido, recordo a resolu��o de 15 de Setembro, em que se recomendava a apresenta��o da proposta dentro do mais breve prazo poss�vel.
+Os factos ocorridos na semana passada - que tiveram origem � margem da Confer�ncia dos Presidentes, e que a utilizaram s� para corroborar e ratificar decis�es tomadas fora dela - criam um dilema: ou a Comiss�o n�o se encontra em condi��es de apresentar esse programa (e nesse caso conviria que o explicasse. Segundo as palavras da sua Presidente, por�m, encontra�se em condi��es de o fazer e dado que a Comiss�o se encontra representada pela Vice�presidente, senhora Comiss�ria de Palacio, creio que antes de se proceder � vota��o seria conveniente conhecer a situa��o da Comiss�o relativamente � sua disponibilidade para apresentar o programa, tal como tinha sido acordado), ou ent�o o Parlamento n�o se encontra em condi��es de examinar este programa como, aparentemente, alguns pretendem. Em minha opini�o, esta segunda hip�tese significaria o abandono das nossas responsabilidades como Parlamento, al�m de introduzir uma tese original, um m�todo desconhecido que 
 consiste em dar a conhecer por escrito aos grupos pol�ticos o discurso pragm�tico da Comiss�o com uma semana de anteced�ncia - e n�o no dia anterior, como se tinha acordado -, tendo em conta o facto de o programa legislativo ir ser discutido em Fevereiro, de modo que poder�amos prescindir do debate, porque no dia seguinte a imprensa e a Internet j� o teriam dado a conhecer a todos os cidad�os, n�o tendo j� o Parlamento motivo para se ocupar do assunto.
+Como o meu grupo � de opini�o que um Parlamento foi feito para escutar, para debater e para reflectir, pensamos n�o haver motivo algum que justifique este adiamento. Cremos, al�m disso, que sim, que a Comiss�o se encontra em condi��es de o fazer, que estamos perfeitamente a tempo de poder restabelecer o acordo original entre o Parlamento e a Comiss�o e de proceder responsavelmente perante as nossas concidad�s e os nossos concidad�os. Logo, a proposta do Grupo do Partido dos Socialistas Europeus que a senhora mencionou � que se mantenha a apresenta��o na quarta�feira do programa da legislatura da Comiss�o Prodi, incluindo neste programa tamb�m o projecto de reforma administrativa, porque, de outro modo, podemos vir a encontrar�nos numa situa��o paradoxal: com a desculpa de n�o existir texto, nega�se, por um lado, ao Presidente da Comiss�o o direito de fazer uso da palavra neste Parlamento, e, por outro lado, a possibilidade de realiza��o de um debate sobre a r
 eforma sem que este Parlamento conhe�a previamente os textos em que se baseia. Rogo�lhe, portanto, Senhora Presidente, que pe�a � Comiss�o que se manifeste agora e que depois se proceda � vota��o.
+(Aplausos da bancada do Grupo PSE)
+
+Senhora Presidente, caros colegas, estou realmente um pouco estupefacto em rela��o � atitude do colega Bar�n Crespo, que exige agora que este ponto da ordem do dia seja inscrito na ordem do dia de quarta-feira.
+Senhor Deputado Bar�n Crespo, o senhor n�o p�de comparecer na passada quinta-feira � Confer�ncia dos Presidentes. N�o o critico por isso; de vez em quando acontece enviarmos um representante. O colega H�nsch representou-o nessa ocasi�o. Realiz�mos um exaustivo debate na Confer�ncia dos Presidentes. Apenas o seu grupo defendeu a posi��o que o senhor agora defende. Procedemos seguidamente � vota��o. Cada presidente tem tantos votos quantos os membros do seu grupo. Teve lugar uma vota��o em rela��o a este ponto. Se bem me recordo, a vota��o teve o seguinte resultado: 422 votos contra 180 e umas poucas absten��es. Ou seja, todos os grupos, com a excep��o dos N�o-inscritos - mas esses nem sequer constituem grupo - chegaram a consenso; apenas o seu grupo defendia que se procedesse do modo que aqui prop�s. Todos os restantes discordavam. Foi essa a decis�o.
+Agora gostaria de dizer algo a respeito da mat�ria de fundo desta quest�o. Confiamos na Comiss�o, em Romano Prodi e a esmagadora maioria do nosso grupo depositou, como todos sabemos, a sua confian�a em Romano Prodi e na Comiss�o, na sequ�ncia de um dif�cil processo. No entanto, somos tamb�m da opini�o de que deveria haver um debate sobre esta estrat�gia da Comiss�o que seguisse um procedimento ordenado, e n�o s� com base numa declara��o oral pronunciada aqui no Parlamento Europeu, mas tamb�m com base num documento que seja decidido na Comiss�o e que apresente uma descri��o deste programa para um per�odo de cinco anos. Esse documento n�o existe!
+
+A Comiss�o ir� apresentar em Fevereiro o programa para o ano 2000. N�s demos o nosso acordo, se a Comiss�o n�o quiser ainda elaborar o Programa para 2000 em Janeiro, ent�o que o fa�a em Fevereiro. Vot�mos a favor. N�o pretendemos entrar em conflito com a Comiss�o s� por entrar, mas somos da opini�o de que, se for poss�vel, a Comiss�o e o Parlamento devem seguir um mesmo caminho. Todavia, enquanto Parlamento, somos tamb�m o �rg�o controlador da Comiss�o e nem tudo o que prov�m da Comiss�o tem de coincidir com a nossa opini�o.
+Gostaria que nos pud�ssemos preparar seriamente nos diferentes grupos para um debate sobre o programa para os pr�ximos cinco anos. N�o � poss�vel prepararmo-nos se ouvirmos aqui falar de uma declara��o e n�o soubermos qual o conte�do dessa mesma declara��o. Por esse motivo, sugerimos - e a minha impress�o � que a Comiss�o tamb�m est� aberta a estas considera��es - que se agende para Fevereiro o debate sobre o programa a longo prazo da Comiss�o at� ao ano 2005. Espero que a Comiss�o chegue a acordo, at� essa data, relativamente a um programa que nos venha a propor, e sugerimos que, tamb�m em Fevereiro, realizemos o debate sobre o programa legislativo da Comiss�o para o ano 2000. �, portanto, tamb�m um contexto objectivo s�rio que nos aconselha a realizar conjuntamente o debate sobre os dois programas. Por isso, o meu grupo rejeita decididamente a proposta do Grupo do Partido dos Socialistas Europeus!
+(Aplausos do Grupo PPE-DE)
+
+Senhora Presidente, quero deixar bem claro que, acima de tudo, a Comiss�o tem o m�ximo respeito pelas decis�es deste Parlamento e, entre elas, a de estabelecer a sua ordem do dia. Respeitamos, portanto, o que este Parlamento possa decidir nesse sentido.
+Quero, por�m, deixar igualmente bem claro que o Presidente da Comiss�o, Romano Prodi, se comprometeu com o Parlamento a acrescentar um novo debate, como recordou o senhor deputado Bar�n Crespo, ao debate anual sobre o programa legislativo da Comiss�o, sobre as grandes linhas de actua��o para o pr�ximo per�odo de cinco anos, quer dizer, para esta legislatura.
+Quero dizer, Senhora Presidente, que, no acordo a que se chegou no m�s de Setembro, este debate se distinguia daquilo que constitui a apresenta��o anual do programa legislativo da Comiss�o. Quero dizer tamb�m, Senhora Presidente, que, por parte da Comiss�o, estamos preparados e dispostos a realizar esse debate quando for conveniente, que est�vamos preparados para o realizar esta semana, como se tinha acordado inicialmente, partindo da base de que o programa seria apresentado na v�spera, num discurso aos grupos parlamentares.
+Quero, portanto, Senhora Presidente, reiterar que, pela nossa parte, j� discutimos o programa de actua��o para os pr�ximos cinco anos e que estamos preparados para, quando o Parlamento decidir - esta mesma semana, se for essa a sua decis�o - vir expor aqui o programa para os pr�ximos cinco anos, e, no m�s que vem, o programa para o ano 2000, que era o que estava absolutamente combinado.
+
+Proponho que votemos o pedido do Grupo do Partido dos Socialistas Europeus que visa voltar a inscrever a declara��o da Comiss�o sobre os seus objectivos estrat�gicos.
+(O Parlamento rejeita o pedido) Presidente. Ainda no que respeita ao dia de quarta�feira, recebi uma outra proposta referente � pergunta oral relativa ao imposto sobre o capital. O Grupo do Partido Popular Europeu/Democratas Europeus pede que esse ponto seja retirado da ordem do dia.
+Algum colega pretende tomar a palavra em nome do grupo e justificar esse pedido?
+
+Senhora Presidente, estou a ouvir alguns risos da bancada do Grupo do Partido dos Socialistas Europeus, mas foi-me dito que tamb�m largos c�rculos do Grupo do Partido dos Socialistas Europeus teriam gostado de retirar este ponto da ordem do dia, visto que aquando da vota��o na Confer�ncia dos Presidentes, o parecer do grupo de trabalho das colegas e dos colegas competentes do Grupo do Partido dos Socialistas Europeus n�o foi apresentado. N�o sei se esta informa��o � correcta, mas n�s, Grupo PPE�DE, ficar�amos, em todo o caso, gratos se este ponto fosse retirado, uma vez que o Parlamento j� se debru�ou sobre esta quest�o v�rias vezes. Existem tamb�m decis�es contra um imposto deste tipo. Por esse motivo, o meu grupo vem requerer a supress�o deste ponto da ordem do dia.
+
+Obrigada, Senhor Deputado Poettering.
+Dou a palavra ao senhor deputado Wurtz, que intervir� contra este pedido.
+
+Senhora Presidente, antes de mais, gostaria de real�ar a aus�ncia de l�gica do senhor deputado Poettering. H� pouco, pregou um serm�o ao Grupo do Partido dos Socialistas Europeus porque este volta atr�s numa decis�o tomada muito claramente na Confer�ncia dos Presidentes. Agora, faz a mesma coisa. N�s discutimos, fomos un�nimes - � excep��o do Grupo do Partido Popular Europeu/Democratas Europeus e do Grupo do Partido Europeu dos Liberais, Democratas e Reformistas - e eu cheguei mesmo a observar, como se recordar�o, caros confrades presidentes, que a quest�o n�o � a de saber se estamos a favor ou contra a taxa de Todin, mas sim a de saber se ousamos ouvir o que a Comiss�o e o Conselho pensam dela. N�o � pedir muito. Assim, reitero a proposta de se manter essa pergunta oral � Comiss�o e ao Conselho, a fim de conhecermos, de uma vez por todas, a posi��o daquelas duas inst�ncias perante esta proposta, relativamente modesta, mas que emitiria um sinal importante di
 rigido � opini�o p�blica, sobretudo depois da emo��o criada em torno do fracasso da Confer�ncia de Seattle.
+
+Coloco � vota��o o pedido do Grupo do Partido Popular Europeu/Democratas Europeus, que visa retirar da ordem do dia a pergunta oral relativa ao imposto sobre o capital.
+(O Parlamento rejeita o pedido, com 164 votos a favor, 166 votos contra e 7 absten��es)
+
+Senhora Presidente, gostaria de agradecer ao senhor deputado Poettering a publicidade que acaba de dar a este debate. Obrigado.
+
+� (ES) Senhora Presidente, ser� que se contou o meu voto, que n�o p�de ser efectuado electronicamente, pelo facto de eu n�o ter o cart�o? O meu voto era "a favor".
+
+Efectivamente, se juntarmos os dois colegas que se manifestaram, obteremos como resultado...
+
+� (ES) Senhora Presidente, a Presid�ncia anunciou o resultado da vota��o. N�o h� lugar para mudan�as.
+
+Caros colegas, mais uma vez, temos todos de trazer o cart�o � segunda-feira. Estamos perante um problema. Por conseguinte, tenho de tomar uma decis�o.
+Tamb�m eu me esqueci do meu cart�o, e teria votado contra. Assim, considero que a pergunta oral se mant�m inscrita na ordem do dia
+� a �ltima vezes que contaremos os cart�es esquecidos. Que fique bem claro e que se informe toda a gente.
+(Aplausos)Sim, a pergunta oral mant�m-se na ordem do dia, e sim, a presidente tem o direito de votar, como tamb�m tem o direito de se esquecer do cart�o.
+Vamos prosseguir com as outras modifica��es da ordem de trabalhos.
+
+Senhora Presidente, na vota��o anterior - e aceitarei a sua decis�o nesta mat�ria - sobre a quest�o dos objectivos estrat�gicos da Comiss�o, informei que gostaria de, em nome do Grupo ELDR, usar da palavra antes da vota��o. Tal n�o se verificou. Solicitar�lhe�ia, pois, que, antes de passarmos ao pr�ximo ponto da ordem do dia, me fosse permitido fazer uma declara��o de voto em nome do meu grupo. Trata�se de uma quest�o importante. Seria vantajoso para a reputa��o deste Parlamento declarar de que modo as pessoas interpretam o que acabamos de fazer � luz da sua pr�pria an�lise pol�tica.
+
+Senhora Presidente, n�o pretendo relan�ar o debate, mas tinha tamb�m pedido a palavra para me pronunciar sobre o pedido do senhor deputado Bar�n Crespo. Tamb�m n�o me chamou. Lamento-o, mas j� se procedeu � vota��o, a decis�o foi tomada, portanto, esque�amos o assunto.
+
+Pe�o muita desculpa, Senhor Deputado H�nsch e Senhor Deputado Cox, mas n�o vi que estavam a pedir a palavra. Dito isto, penso que as posi��es est�o muito claras e que ser�o correctamente reproduzidas na acta. Quando amanh� aprovarmos a acta da sess�o de hoje, os colegas que considerarem que as posi��es n�o foram suficientemente bem explicadas, podem pedir modifica��es. Penso que se trata de uma boa solu��o. Evidentemente que a acta da reuni�o de amanh� ter� em conta todas as explica��es suplementares. Creio ser uma solu��o melhor do que proceder agora a declara��es de voto que nos levariam muito longe. Senhor Deputado Cox, Senhor Deputado H�nsch, est�o de acordo?
+
+Senhora Presidente, se o resultado da vota��o reflectir correctamente o sentido de voto do meu grupo, n�o me posso opor ao mesmo, nem o farei. Se a sua decis�o for no sentido de eu n�o poder fazer uma declara��o de voto, terei de a aceitar, mas com reservas.
+
+Prestaremos portanto muita aten��o � redac��o da acta. Ali�s, prestamos sempre. Se n�o reflectir bem as nossas posi��es, podemos sempre corrigi-la.
+(O Parlamento aprova a ordem de trabalhos assim modificada)
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/sv.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/sv.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/sv.test
new file mode 100644
index 0000000..b43b5df
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/sv.test
@@ -0,0 +1,108 @@
+�terupptagande av sessionen
+Jag f�rklarar Europaparlamentets session �terupptagen efter avbrottet den 17 december. Jag vill p� nytt �nska er ett gott nytt �r och jag hoppas att ni haft en trevlig semester.
+Som ni kunnat konstatera �gde "den stora �r 2000-buggen" aldrig rum. D�remot har inv�narna i ett antal av v�ra medlemsl�nder drabbats av naturkatastrofer som verkligen varit f�rskr�ckliga. Ni har beg�rt en debatt i �mnet under sammantr�desperiodens kommande dagar. Till dess vill jag att vi, som ett antal kolleger beg�rt, h�ller en tyst minut f�r offren f�r bl.a. stormarna i de l�nder i Europeiska unionen som drabbats. Jag ber er resa er f�r en tyst minut.
+(Parlamentet h�ll en tyst minut.)
+
+Fru talman! Det g�ller en ordningsfr�ga. Ni k�nner till fr�n media att det skett en rad bombexplosioner och mord i Sri Lanka. En av de personer som mycket nyligen m�rdades i Sri Lanka var Kumar Ponnambalam, som bes�kte Europaparlamentet f�r bara n�gra m�nader sedan. Skulle det vara m�jligt f�r er, fru talman, att skriva ett brev till den srilankesiska presidenten i vilket parlamentets beklagande uttrycks �ver hans och de �vriga brutala d�dsfallen i Sri Lanka och uppmanar henne att g�ra allt som st�r i hennes makt f�r att f� en fredlig l�sning p� en mycket komplicerad situation?
+
+Ja, herr Evans, jag tror att ett initiativ i den riktning ni just f�reslagit skulle vara mycket l�mpligt. Om kammaren inst�mmer skall jag g�ra som herr Evans f�reslagit.
+
+Fru talman! Det g�ller en ordningsfr�ga. Jag skulle vilja ha r�d fr�n er vad g�ller artikel 143 om avvisning av ett �rende som otill�tligt. Min fr�ga har att g�ra med n�got som kommer att behandlas p� torsdag och som jag d� kommer att ta upp igen.
+Cunhas bet�nkande om de fler�riga utvecklingsprogrammen behandlas i parlamentet p� torsdag och det inneh�ller ett f�rslag i punkt 6 om att n�got slag av kvoteringsp�f�ljder b�r inf�ras f�r l�nder som misslyckas med att uppfylla sina �rliga m�l r�rande minskning av flottorna. I bet�nkandet st�r det att detta b�r g�ras trots principen om relativ stabilitet. Jag anser att principen om relativ stabilitet �r en grundl�ggande r�ttsprincip inom den gemensamma fiskeripolitiken, och ett f�rslag som skulle undergr�va den m�ste betraktas som r�ttsligt otill�tligt. Jag vill veta om jag kan g�ra en s�dan inv�ndning mot ett bet�nkande, som allts� inte �r ett lagf�rslag, och om det �r n�got som jag har beh�righet att g�ra p� torsdag.
+
+Det �r faktiskt just vid det tillf�llet som ni, om ni vill, kan ta upp denna fr�ga, dvs. p� torsdag innan bet�nkandet l�ggs fram.
+
+Fru talman! Under �rets f�rsta sammantr�desperiod f�r Europaparlamentet best�mde man dessv�rre i Texas i USA att n�sta torsdag avr�tta en d�dsd�md, en ung man p� 34 �r som vi kan kalla Hicks.
+P� uppmaning av en fransk parlamentsledamot, Zimeray, har redan en framst�llning gjorts, undertecknad av m�nga, bland annat jag sj�lv, men jag uppmanar er, i enlighet med de riktlinjer som Europaparlamentet och hela den europeiska gemenskapen alltid har h�llit fast vid, att med all den tyngd ni har i kraft av ert �mbete och den institution ni f�retr�der, uppmana Texas guvern�r, Bush, att uppskjuta verkst�lligheten och att ben�da den d�mde.
+Detta �r helt i linje med de principer som vi alltid har h�vdat.
+
+Tack, herr Segni, det skall jag g�rna g�ra. Det ligger faktiskt helt i linje med de st�ndpunkter v�rt parlament alltid antagit.
+
+Fru talman! Jag vill f�sta er uppm�rksamhet vid ett fall som parlamentet vid upprepade tillf�llen har befattat sig med. Det g�ller fallet Alexander Nikitin. Alla gl�der vi oss �t att domstolen har friat honom och tydligt visat att tillg�ngligheten till milj�information �r en konstitutionell r�ttighet �ven i Ryssland. Nu �r det emellertid s� att han skall �talas p� nytt i och med att allm�nne �klagaren �verklagar. Vi �r medvetna om, vilket vi ocks� - inte minst under f�rra �rets sista plenarsammantr�de - har kunnat konstatera i en l�ng rad beslut, att detta inte enbart �r ett juridiskt fall och att det �r fel att beskylla Alexander Nikitin f�r kriminalitet och f�rr�deri, eftersom vi som ber�rda parter drar nytta av de resultat han har kommit fram till. Resultaten utg�r grunden f�r de europeiska programmen f�r skydd av Barents hav, och d�rf�r ber jag er granska ett utkast till ett brev som skildrar de viktigaste fakta samt att i enlighet med parlamen
 tsbesluten visa Ryssland denna st�ndpunkt klart och tydligt.
+
+Ja, fru Schroedter, jag skall mycket g�rna granska fakta r�rande denna fr�ga n�r jag f�tt ert brev.
+
+Fru talman! F�rst skulle jag vilja ge er en komplimang f�r det faktum att ni h�llit ert ord och att det nu, under det nya �rets f�rsta sammantr�desperiod, faktiskt har skett en kraftig ut�kning av antalet TV-kanaler p� v�ra rum. Men, fru talman, det som jag bad om har inte intr�ffat. Det finns nu visserligen tv� finska kanaler och en portugisisk, men det finns fortfarande ingen nederl�ndsk kanal. Jag bad er om en nederl�ndsk kanal, eftersom nederl�ndare ocks� g�rna vill ta del av nyheterna varje m�nad d� vi blir f�rvisade till den h�r platsen. Jag skulle s�ledes p� nytt vilja be er att ombes�rja att vi ocks� f�r en nederl�ndsk kanal.
+
+Fru Plooij-van Gorsel! Jag kan tala om f�r er att fr�gan finns p� f�redragningslistan f�r kvestorernas m�te p� onsdag. Jag hoppas att den kommer att granskas i en positiv anda.
+
+Fru talman! Kan ni ber�tta f�r mig varf�r detta parlament inte f�ljer den arbetsskyddslagstiftning det faktiskt antar? Varf�r har det inte genomf�rts n�got luftkvalitetstest i denna byggnad efter denna mandatperiods b�rjan? Varf�r har inte arbetsskyddskommitt�n haft n�gra sammantr�den sedan 1998? Varf�r har det inte skett n�gra brand�vningar i parlamentets byggnader i Bryssel eller Strasbourg? Varf�r finns det inga instruktioner om hur man skall bete sig om det b�rjar brinna? Varf�r har inte trapporna byggts om efter den olycka jag r�kade ut f�r? Varf�r uppr�tth�ller man inte best�mmelserna om r�kfria omr�den? Jag tycker det �r skr�mmande att vi antar lagstiftning som vi inte sj�lva f�ljer.
+(Appl�der)
+
+Fru Lynne! Ni har helt r�tt och jag skall kontrollera om allt detta faktiskt inte har gjorts. Jag skall ocks� �verl�mna problemet till kvestorerna och jag �r �vertygad om att de �r m�na om att se till att vi respekterar de regler som vi faktiskt r�stat fram.
+
+Fru talman! D�ez Gonz�lez och jag har st�llt n�gra fr�gor ang�ende vissa av vice ordf�rande de Palacios �sikter som �tergavs i en spansk dagstidning. De ansvariga har inte tagit med dessa fr�gor p� f�redragningslistan, eftersom man ans�g att dessa hade besvarats vid ett tidigare sammantr�de.
+Jag ber att man ompr�var det beslutet, eftersom s� inte �r fallet. De fr�gor som tidigare besvarats handlade om de Palacios inblandning i ett s�rskilt �rende, inte om de uttalanden som �tergavs i dagstidningen ABC den 18 november i fjol.
+
+K�ra kollega! Vi skall kontrollera allt detta. Jag erk�nner att f�r n�rvarande f�refaller saker och ting litet oklara. Vi skall allts� se �ver detta mycket noga s� allt blir i sin ordning.
+
+Fru talman! Jag vill veta om det kommer att g� ut ett tydligt budskap fr�n parlamentet under veckan om v�rt missn�je r�rande dagens beslut om att v�gra en f�rnyelse av vapenembargot mot Indonesien, med h�nsyn till att det stora flertalet i detta parlament har st�tt vapenembargot mot Indonesien? Dagens beslut att inte f�rnya embargot �r oerh�rt farligt med h�nsyn till situationen d�r. Parlamentet b�r allts� s�nda ut ett budskap, eftersom detta �r vad det stora flertalet vill. Det �r oansvarigt av EU:s medlemsstater att v�gra att f�rnya embargot. Som olika personer har sagt, �r situationen d�r oerh�rt turbulent. Det finns faktiskt en risk f�r en framtida milit�rkupp. Vi vet inte vad som h�nder. S� varf�r skall vapentillverkarna i EU profitera p� oskyldiga m�nniskors bekostnad?
+(Appl�der)
+
+I vilket fall som helst �r fr�gan f�r n�rvarande inte f�rem�l f�r n�gon beg�ran om br�dskande f�rfarande p� torsdag.
+
+Arbetsplan
+N�sta punkt p� f�redragningslistan �r fastst�llande av arbetsplanen. Det slutgiltiga f�rslaget till f�redragningslista som utarbetats av talmanskonferensen vid sammantr�det den 13 januari i enlighet med artikel 110 i arbetsordningen har delats ut. F�r m�ndag och tisdag har inga �ndringar f�reslagits.
+Betr�ffande onsdag:
+Den socialistiska gruppen har beg�rt att ett uttalande fr�n kommissionen om dess strategiska m�l f�r de fem kommande �ren samt om kommissionens administrativa reform skall tas upp.
+Jag skulle vilja att Bar�n Crespo, som l�mnat beg�ran, uttalar sig f�r att motivera den, om han vill, naturligtvis. Sedan g�r vi som vi brukar: vi lyssnar till en talare f�r och en talare emot.
+
+Fru talman! Framl�ggandet av kommission Prodis politiska program f�r hela mandatperioden bottnar i ett f�rslag fr�n Europeiska socialdemokratiska partiets grupp som antogs med enh�llighet p� talmanskonferensen i september samt ett tydligt godk�nnande fr�n ordf�rande Prodi som upprepade detta �tagande i sitt anf�rande i samband med tilltr�dandet av sitt �mbete.
+Detta �tagande �r viktigt, med tanke p� att kommissionen �r det organ som enligt f�rdragen har ensam initiativr�tt, och det utg�r d�rf�r grunden till parlamentets politiska och lagstiftande verksamhet de kommande fem �ren. Jag vill dessutom, fru talman, p�minna om att parlamentet vid tv� tillf�llen under f�reg�ende mandatperiod r�stade om f�rtroendet f�r ordf�rande Prodi; man r�stade p� nytt om detta under denna mandatperiod i juli, och sedan, n�r den nya kommissionen hade p�b�rjat sitt arbete, gav man i september en f�rtroender�st till hela kommissionen. D�rf�r har det funnits tillr�ckligt mycket tid f�r kommissionen att f�rbereda sitt program och f�r att oss att ta del av detta och redog�ra f�r detta inf�r medborgarna. Jag vill ocks� p�minna om resolutionen av den 15 september, d�r man rekommenderade att f�rslaget skulle l�ggas fram s� snart som m�jligt.
+Det som h�nde f�rra veckan - n�got som inleddes utanf�r talmanskonferensen, en konferens som endast utnyttjades f�r att bestyrka och bekr�fta det beslut som fattats utanf�r ramarna f�r denna - utg�r ett dilemma: antingen �r det s� att kommissionen inte �r i st�nd att presentera programmet (i s�dant fall b�r den klarg�ra detta. Enligt ordf�randens uttalanden kan man presentera programmet. Med tanke p� att kommissionen f�retr�ds av vice ordf�rande de Palacio, anser jag att vi innan omr�stningen sker b�r f� veta huruvida kommissionen �r beredd att l�gga fram programmet, s� som man kommit �verens om); annars �r parlamentet inte i st�nd att granska programmet, s� som vissa tycks anse. Enligt min uppfattning skulle den sistn�mnda hypotesen inneb�ra att vi f�rsummade v�rt ansvar som parlament, f�rutom att man d� skulle inf�ra en grundtes, en ok�nd metod som inneb�r att de politiska grupperna skriftligen f�r ta del av kommissionens tankar kring progr
 ammet en vecka i f�rv�g i st�llet f�r en dag i f�rv�g, som man kommit �verens om. D� b�r man t�nka p� att lagstiftningsprogrammet skall debatteras i februari, och d�rf�r skulle vi lika g�rna kunna avst� fr�n den debatten, f�r pressen och Internet skulle redan dagen d�rp� tillk�nnage programmet f�r alla medborgare, och det skulle inte l�ngre finnas n�gon anledning f�r parlamentet att �gna sig �t fr�gan.
+Eftersom min grupp anser att parlamentet �r till f�r att lyssna, f�r att debattera och f�r att reflektera, anser vi att det inte finns n�got som r�ttf�rdigar en senarel�ggning av debatten, och om kommissionen �r beredd till det, menar vi att det fortfarande �r m�jligt att �teruppr�tta det ursprungliga avtalet mellan parlamentet och kommissionen och agera p� ett ansvarsfullt s�tt gentemot v�ra medborgare. D�rf�r inneb�r f�rslaget fr�n Europeiska socialdemokratiska partiets grupp, som fru talmannen n�mnde, att kommission Prodis lagstiftningsprogram l�ggs fram p� onsdag som planerat, och att man inbegriper f�rslaget om en administrativ reform, f�r i annat fall kan en paradoxal situation uppst�: � ena sidan v�gras kommissionens ordf�rande, med urs�kten att det inte finns n�got dokument, r�tten att tala i parlamentet, � andra sidan om�jligg�rs en debatt om reformen, eftersom parlamentet inte tidigare har f�tt ta del av dokumenten i fr�ga. D�rf�r be
 r jag, fru talman, att ni uppmanar kommissionen att uttala sig och att vi d�refter g�r till omr�stning.
+(Appl�der fr�n PSE)
+
+Fru talman, �rade kolleger! Jag m�ste s�ga att jag �r n�got f�rv�nad �ver kollegan Bar�n Crespos agerande n�r han nu kr�ver att denna punkt p� f�redragningslistan flyttas till onsdagen.
+Herr Bar�n Crespo! Ni kunde inte n�rvara vid talmanskonferensen f�rra torsdagen. Det t�nker jag inte kritisera: det h�nder alltid att man har en st�llf�retr�dare. Kollegan H�nsch var d�r och f�retr�dde er. Vi f�rde en grundlig debatt p� talmanskonferensen. Det var bara er egen grupp som f�respr�kade det ni nu talar om. D�refter r�stade vi. Varje ordf�rande har ju lika m�nga r�ster som hans eller hennes grupp har medlemmar. Det r�stades p� denna punkt. Omr�stningen resulterade vad jag kan minnas i f�ljande siffror: 422 r�ster mot 180, med n�gra f� nedlagda r�ster. Detta betyder att alla grupper, med undantag f�r de gruppl�sa - men de �r ju heller ingen grupp - var �verens, och endast er grupp ans�g att man borde f�rfara p� det s�tt som ni har f�reslagit h�r. Alla andra var av en annan �sikt. S�dant blev beslutet.
+Nu vill jag sj�lv s�ga n�got i �mnet. Vi hyser f�rtroende f�r kommissionen, f�r Romano Prodi, och en mycket stor majoritet av v�r grupp uttalade sitt f�rtroende f�r Romano Prodi och kommissionen efter en, som alla vet, sv�r process. Men vi anser ocks� att vi m�ste f�ra en debatt om kommissionens strategi under ordnade former, inte bara utifr�n ett muntligt uttalande h�r i Europaparlamentet utan ocks� fr�n ett dokument som kommissionen har beslutat om och som beskriver programmet f�r fem �r fram�ver. N�got s�dant dokument existerar inte!
+(Appl�der)
+I februari skall kommissionen l�gga fram programmet f�r �r 2000. Vi har sagt att detta g�r f�r sig, om kommissionen inte vill g�ra klart program 2000 redan i januari, d� g�r vi det i februari. Det har vi sagt ja till. Vi vill ju f�r den delen inte gr�la med kommissionen utan anser att kommission och parlament s� l�ngt det �r m�jligt skall g� samma v�g. Men samtidigt �r vi som parlament kontrollinstans gentemot kommissionen. Och allt som kommer fr�n kommissionen beh�ver inte n�dv�ndigtvis vara v�r �sikt.
+Jag vill att vi inom grupperna skall kunna f�rbereda oss p� ett klokt s�tt inf�r en debatt om fem�rsprogrammet. Man kan inte f�rbereda sig genom att sitta h�r och lyssna p� ett uttalande utan att alls veta vad som ligger bakom ett s�dant uttalande. D�rf�r rekommenderar vi - och jag har intrycket att kommissionen likaledes �r �ppen f�r den tanken - att debatten om kommissionens l�ngsiktiga arbete fram till �r 2000 f�rs nu i februari - jag hoppas ocks� att kommissionen till dess har kommit �verens om ett program att f�resl� - och att vi samtidigt i februari kan debattera kommissionens lagstiftningsprogram f�r �r 2000. Det �r s�lunda �ven av f�rnuftiga och sakliga sk�l som vi b�r debattera b�gge programmen samtidigt. D�rf�r tillbakavisar min grupp � det best�mdaste den socialistiska gruppens f�rslag!
+(Appl�der fr�n PPE-DE-gruppen)
+
+Fru talman! Jag vill i f�rsta hand klarg�ra att kommissionen hyser den st�rsta respekt f�r parlamentets beslut, och d�rmed �ven f�r beslutet om fastst�llandet av arbetsplanen. Vi respekterar s�ledes i det avseendet parlamentets beslut. Samtidigt vill jag klarg�ra att ordf�rande Prodi har lovat parlamentet ytterligare en debatt, precis som Bar�n p�pekade, f�rutom den �rliga debatten om kommissionens lagstiftningsprogram, en debatt om de viktigaste handlingslinjerna under den kommande fem�rsperioden, det vill s�ga under denna mandatperiod.
+Det jag vill s�ga, fru talman, �r att man i den �verenskommelse som uppn�ddes i september, gjorde en �tskillnad mellan den debatten och framst�llandet av kommissionens �rliga lagstiftningsprogram. Och jag vill ocks�, fru talman, s�ga att vi fr�n kommissionens sida �r f�rberedda och f�rdiga f�r denna debatt n�r �n det m� vara, att vi �r redo att h�lla debatten denna vecka, s� som man i princip avtalade, med tanke p� att utkastet redan har presenterats i ett anf�rande inf�r de parlamentariska grupperna.
+D�rf�r, fru talman, vill jag �n en g�ng p�peka att vi f�r v�r del har diskuterat igenom �tg�rdsprogrammet f�r de kommande fem �ren, och att vi �r redo att, n�r som helst d� parlamentet beslutar det, - den h�r veckan om man best�mmer sig f�r det - komma och presentera programmet f�r de kommande fem �ren, och sedan n�sta m�nad, programmet f�r �r 2000, precis som avtalat.
+
+Jag f�resl�r att vi r�star om beg�ran fr�n den socialistiska gruppen att p� nytt f�ra upp kommissionens uttalande om dess strategiska m�l p� f�redragningslistan.
+(Parlamentet avslog beg�ran.) Talmannen. Betr�ffande onsdagen har jag ocks� mottagit ett annat f�rslag betr�ffande den muntliga fr�gan om kapitalskatt. PPE-DE-gruppen beg�r att denna punkt skall strykas fr�n f�redragningslistan.
+Vill n�gon kollega beg�ra ordet f�r gruppens r�kning och motivera denna beg�ran?
+
+Fru talman! Eftersom jag h�r att det skrattas bland socialisterna: man har sagt mig att �ven vida kretsar inom den socialistiska gruppen g�rna vill se den h�r punkten avf�rd fr�n f�redragningslistan, eftersom det vid omr�stningen p� talmanskonferensen saknades votum f�r ber�rda kolleger i den socialistiska arbetsgruppen. Jag vet inte huruvida denna information st�mmer, men vi i PPE-DE-gruppen vore i alla fall tacksamma ifall punkten str�ks, d� ju parlamentet redan har befattat sig med fr�gan flera g�nger. Det finns ocks� beslut fattade mot en s�dan skatt. D�rf�r yrkar min grupp p� att punkten avf�rs fr�n f�redragningslistan.
+
+Tack, herr Poettering.
+Vi skall nu lyssna till Wurtz som skall uttala sig emot denna beg�ran.
+
+Fru talman! Jag skulle till att b�rja med vilja understryka Poetterings bristande logik. Han har just l�xat upp den socialistiska gruppen f�r att den �ndrat sig n�r det g�ller ett beslut som fattats med mycket liten marginal i talmanskonferensen. Men han g�r samma sak sj�lv. Vi diskuterade och var eniga, utom PPE-gruppen och den liberala gruppen, och jag noterade t.o.m., det minns ni s�kert k�ra ordf�randekolleger, att fr�gan inte handlar om huruvida ni �r f�r eller emot Todinskatten, utan om ni v�gar h�ra vad kommissionen och r�det tycker om den. Det �r inte f�r mycket beg�rt. Jag upprepar d�rf�r f�rslaget att beh�lla denna muntliga fr�ga till kommissionen och r�det f�r att en g�ng f�r alla f� veta vilken inst�llning dessa tv� instanser har till denna relativt blygsamma beg�ran, som �nd� skulle utg�ra en viktig signal till allm�nheten, s�rskilt med tanke p� den oro som uppstod efter den misslyckade konferensen i Seattle.
+
+Vi skall r�sta om beg�ran fr�n PPE-DE-gruppen som syftar till att stryka den muntliga fr�gan om kapitalskatt fr�n f�redragningslistan.
+(Parlamentet avslog beg�ran med 164 r�ster f�r, 166 emot. 7 ledam�ter avstod fr�n att r�sta.)
+
+Fru talman! Jag skulle vilja tacka Poettering f�r att han just gjort reklam f�r denna debatt. Tack.
+
+Fru talman! Jag undrar om �ven min r�st har r�knats, trots att den inte kunde avges p� elektronisk v�g, eftersom jag inte har n�got kort? Jag r�stade "f�r".
+
+Om man l�gger till de tv� kolleger som yttrat sig blir resultatet...
+
+Fru talman! Ordf�randeskapet har redan meddelat resultatet fr�n omr�stningen. Det finns inget utrymme f�r n�gra �ndringar.
+(Appl�der)
+
+K�ra kolleger! �nnu en g�ng vill jag p�peka att alla m�ste ha sitt kort p� m�ndagen. Det �r tydligt att vi har ett problem h�r. Jag m�ste d�rf�r fatta ett beslut.
+Jag har ocks� gl�mt mitt kort och jag skulle ha r�stat emot. Jag anser d�rf�r att den muntliga fr�gan kvarst�r p� f�redragningslistan.
+(Appl�der) Det �r sista g�ngen vi tar h�nsyn till att ni gl�mt korten. Jag hoppas att alla har f�rst�tt och vi skall se till att alla f�r veta det.
+(Appl�der)
+Ja, den munliga fr�gan kvarst�r p� f�redragningslistan och, ja, talmannen har r�tt att r�sta, liksom hon har r�tt att ocks� gl�mma sitt kort.
+Vi forts�tter nu med �vriga �ndringar i f�redragningslistan.
+
+Fru talman! Under den tidigare omr�stningen - och jag kommer att f�lja ert utslag i denna fr�ga - r�rande fr�gan om kommissionens strategiska plan, sade jag att jag ville uttala mig f�re omr�stningen p� min grupps v�gnar. S� blev inte fallet. Jag skulle uppskatta om jag vid denna punkts avslutande kunde f� avge en r�stf�rklaring p� min grupps v�gnar. Detta �r en viktig fr�ga. Det skulle vara anv�ndbart f�r kammarens r�kning att upplysa om hur folk uppfattar vad vi just gjort mot bakgrund av deras egen politiska analys.
+
+Fru talman! Jag skall inte ta upp debatten p� nytt, men �ven jag hade beg�rt ordet f�r att ta st�llning till herr Bar�n Crespos beg�ran. Ni l�t mig aldrig komma till tals. Det beklagar jag, men omr�stningen har genomf�rts, beslutet har fattats, allts� f�r det vara.
+
+Jag �r ledsen, herr H�nsch och herr Cox, jag s�g inte att ni hade beg�rt ordet. Jag tror �nd� att st�ndpunkterna �r tydliga och de kommer att bekr�ftas i protokollet. N�r vi i morgon justerar protokollet fr�n dagens sammantr�de kan de kolleger, som d� anser att st�ndpunkterna inte f�rklarats tillr�ckligt tydligt, beg�ra �ndringar. Jag tror att det �r ett bra s�tt. Naturligtvis kommer man i protokollet fr�n morgondagens sammantr�de att ta h�nsyn till alla kompletterande f�rklaringar. Jag tror att det �r b�ttre �n att nu genomf�ra r�stf�rklaringar som kommer att leda mycket l�ngt. Vad s�ger ni om det, herr Cox och herr H�nsch?
+
+Fru talman! Om omr�stningsregistreringen p� ett korrekt s�tt visar hur min grupp r�stade, skall jag och kan jag inte protestera mot denna. Om ert utslag inneb�r att jag inte kan avge en r�stf�rklaring, accepterar jag detta men med reservation.
+
+Vi skall allts� vara mycket noggranna vid uppr�ttandet av protokollet. Det �r vi f�r �vrigt alltid. Om det inte �terger st�ndpunkterna tillfredsst�llande, kan vi eventuellt �ndra i det.
+(Arbetsplanen fastst�lldes med dessa �ndringar.)
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/test-referencial.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/test-referencial.txt b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/test-referencial.txt
new file mode 100644
index 0000000..ca32073
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/test-referencial.txt
@@ -0,0 +1,10 @@
+da.test;da
+de.test;de
+el.test;el
+en.test;en
+es.test;es
+fi.test;fi
+fr.test;fr
+it.test;it
+nl.test;nl
+pt.test;pt

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-htmlunit/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-htmlunit/build-ivy.xml b/nutch-plugins/lib-htmlunit/build-ivy.xml
new file mode 100644
index 0000000..7022f4e
--- /dev/null
+++ b/nutch-plugins/lib-htmlunit/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-htmlunit" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-htmlunit/build.xml b/nutch-plugins/lib-htmlunit/build.xml
new file mode 100644
index 0000000..14f5d8f
--- /dev/null
+++ b/nutch-plugins/lib-htmlunit/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-htmlunit" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">    
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+  </path>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-htmlunit/ivy.xml b/nutch-plugins/lib-htmlunit/ivy.xml
new file mode 100644
index 0000000..6430535
--- /dev/null
+++ b/nutch-plugins/lib-htmlunit/ivy.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-htmlunit/plugin.xml b/nutch-plugins/lib-htmlunit/plugin.xml
new file mode 100644
index 0000000..290a137
--- /dev/null
+++ b/nutch-plugins/lib-htmlunit/plugin.xml
@@ -0,0 +1,166 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A common library bundling HtmlUnit/Selenium and their dependencies
+ ! for use by protocol plugins that render pages with HtmlUnit
+ !-->
+<plugin
+   id="lib-htmlunit"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-htmlunit.jar">
+        <export name="*"/>
+     </library>
+     <!-- all classes from dependent libraries are exported -->
+     <library name="cglib-nodep-2.1_3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-codec-1.9.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-collections-3.2.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-exec-1.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-io-2.4.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-jxpath-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-lang3-3.3.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-logging-1.1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="cssparser-0.9.14.jar">
+       <export name="*"/>
+     </library>
+     <library name="gson-2.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="guava-18.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="htmlunit-2.15.jar">
+       <export name="*"/>
+     </library>
+     <library name="htmlunit-core-js-2.15.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpclient-4.3.4.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpcore-4.3.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpmime-4.3.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="ini4j-0.5.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-http-8.1.15.v20140411.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-io-8.1.15.v20140411.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-util-8.1.15.v20140411.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-websocket-8.1.15.v20140411.jar">
+       <export name="*"/>
+     </library>
+     <library name="jna-3.4.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="nekohtml-1.9.21.jar">
+       <export name="*"/>
+     </library>
+     <library name="netty-3.5.2.Final.jar">
+       <export name="*"/>
+     </library>
+     <library name="operadriver-1.5.jar">
+       <export name="*"/>
+     </library>
+     <library name="operalaunchers-1.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="phantomjsdriver-1.2.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="platform-3.4.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="protobuf-java-2.4.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="sac-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-api-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-chrome-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-firefox-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-htmlunit-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-ie-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-java-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-remote-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-safari-driver-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-support-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="serializer-2.7.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="webbit-0.4.14.jar">
+       <export name="*"/>
+     </library>
+     <library name="xalan-2.7.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="xercesImpl-2.11.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="xml-apis-1.4.01.jar">
+       <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-htmlunit/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-htmlunit/pom.xml b/nutch-plugins/lib-htmlunit/pom.xml
new file mode 100644
index 0000000..e128b8a
--- /dev/null
+++ b/nutch-plugins/lib-htmlunit/pom.xml
@@ -0,0 +1,55 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-htmlunit</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-htmlunit</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.seleniumhq.selenium</groupId>
+            <artifactId>selenium-java</artifactId>
+            <version>2.44.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.opera</groupId>
+            <artifactId>operadriver</artifactId>
+            <version>1.5</version>
+        </dependency>
+        <dependency>
+            <groupId>com.codeborne</groupId>
+            <artifactId>phantomjsdriver</artifactId>
+            <version>1.2.1</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/nutch-plugins/lib-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
new file mode 100644
index 0000000..064894e
--- /dev/null
+++ b/nutch-plugins/lib-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.openqa.selenium.By;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
+import org.openqa.selenium.TimeoutException;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.htmlunit.HtmlUnitDriver;
+import org.openqa.selenium.io.TemporaryFilesystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.gargoylesoftware.htmlunit.WebClient;
+
+public class HtmlUnitWebDriver extends HtmlUnitDriver {
+
+  private static final Logger LOG = LoggerFactory.getLogger(HtmlUnitWebDriver.class);
+  private static boolean enableJavascript;
+  private static boolean enableCss;
+  private static boolean enableRedirect;
+  private static long javascriptTimeout;
+  private static int maxRedirects;
+  
+  public HtmlUnitWebDriver() {
+    super(enableJavascript);
+  }
+  
+  @Override
+  protected WebClient modifyWebClient(WebClient client) {
+    client.getOptions().setJavaScriptEnabled(enableJavascript);
+    client.getOptions().setCssEnabled(enableCss);
+    client.getOptions().setRedirectEnabled(enableRedirect);
+    if(enableJavascript)
+      client.setJavaScriptTimeout(javascriptTimeout);
+      client.getOptions().setThrowExceptionOnScriptError(false);
+      if(enableRedirect)
+        client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
+	  return client;
+  }
+  
+  public static WebDriver getDriverForPage(String url, Configuration conf) {
+    long pageLoadTimout = conf.getLong("page.load.delay", 3);
+    enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
+    enableCss = conf.getBoolean("htmlunit.enable.css", false);
+    javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
+    int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
+    enableRedirect = redirects <= 0 ? false : true;
+    maxRedirects = redirects;
+	  
+    WebDriver driver = null;
+	  
+    try {
+      driver = new HtmlUnitWebDriver();
+      driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
+      driver.get(url);
+     } catch(Exception e) {
+       if(e instanceof TimeoutException) {
+	       LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+	       return driver;
+     }
+     cleanUpDriver(driver);
+     throw new RuntimeException(e);
+    }
+
+    return driver;
+  }
+
+  public static String getHTMLContent(WebDriver driver, Configuration conf) {
+    try {
+      if (conf.getBoolean("take.screenshot", false))
+        takeScreenshot(driver, conf);
+		  
+      String innerHtml = "";
+      if(enableJavascript) {
+	      WebElement body = driver.findElement(By.tagName("body"));
+	      innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); 
+      }
+      else
+	      innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+      return innerHtml;
+    } catch(Exception e) {
+	    TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+    	cleanUpDriver(driver);
+    	throw new RuntimeException(e);
+    } 
+  }
+
+  public static void cleanUpDriver(WebDriver driver) {
+    if (driver != null) {
+      try {
+        driver.close();
+        driver.quit();
+        TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  /**
+   * Function for obtaining the HTML BODY using the selected
+   * {@link org.openqa.selenium.WebDriver}.
+   * There are a number of configuration properties within
+   * <code>nutch-site.xml</code> which determine whether to
+   * take screenshots of the rendered pages and persist them
+   * as timestamped .png's into HDFS.
+   * @param url the URL to fetch and render
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the rendered inner HTML page
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = getDriverForPage(url, conf);
+
+    try {
+      if (conf.getBoolean("take.screenshot", false))
+	      takeScreenshot(driver, conf);
+
+      String innerHtml = "";
+      if(enableJavascript) {
+	      WebElement body = driver.findElement(By.tagName("body"));
+    	  innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); 
+      }
+      else
+    	  innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+      return innerHtml;
+
+    } catch (Exception e) {
+	    TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      throw new RuntimeException(e);
+    } finally {
+      cleanUpDriver(driver);
+    }
+  }
+
+  private static void takeScreenshot(WebDriver driver, Configuration conf) {
+    try {
+      String url = driver.getCurrentUrl();
+      File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+      LOG.debug("In-memory screenshot taken of: {}", url);
+      FileSystem fs = FileSystem.get(conf);
+      if (conf.get("screenshot.location") != null) {
+    	  Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName());
+        OutputStream os = null;
+        if (!fs.exists(screenshotPath)) {
+          LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
+          os = fs.create(screenshotPath);
+        }
+        InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
+        IOUtils.copyBytes(is, os, conf);
+        LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); 
+      } else {
+        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
+            + "'screenshot.location' is absent from nutch-site.xml.", url);
+      }
+    } catch (Exception e) {
+    	cleanUpDriver(driver);
+    	throw new RuntimeException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/nutch-plugins/lib-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
new file mode 100644
index 0000000..c2b88a6
--- /dev/null
+++ b/nutch-plugins/lib-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import com.gargoylesoftware.htmlunit.WebWindowEvent;
+import com.gargoylesoftware.htmlunit.WebWindowListener;
+
+public class HtmlUnitWebWindowListener implements WebWindowListener {
+
+  private Integer redirectCount = 0;
+  private Integer maxRedirects = 0;
+  
+  public HtmlUnitWebWindowListener() {
+    
+  }
+  
+  public HtmlUnitWebWindowListener(int maxRedirects) {
+    this.maxRedirects = maxRedirects;
+  }
+  
+  @Override
+  public void webWindowOpened(WebWindowEvent event) {
+    
+  }
+
+  @Override
+  public void webWindowContentChanged(WebWindowEvent event) {
+    redirectCount++;
+    if(redirectCount > maxRedirects)
+      throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects);
+  }
+
+  @Override
+  public void webWindowClosed(WebWindowEvent event) {
+    
+  }
+  
+}
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/build.xml b/nutch-plugins/lib-http/build.xml
new file mode 100644
index 0000000..f26a409
--- /dev/null
+++ b/nutch-plugins/lib-http/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-http" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/ivy.xml b/nutch-plugins/lib-http/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/lib-http/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/plugin.xml b/nutch-plugins/lib-http/plugin.xml
new file mode 100644
index 0000000..a96e59d
--- /dev/null
+++ b/nutch-plugins/lib-http/plugin.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A common framework for http protocol implementations
+ !-->
+<plugin
+   id="lib-http"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-http.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/pom.xml b/nutch-plugins/lib-http/pom.xml
new file mode 100644
index 0000000..0c39ff9
--- /dev/null
+++ b/nutch-plugins/lib-http/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-http</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-http</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/BlockedException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/BlockedException.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/BlockedException.java
new file mode 100644
index 0000000..b1103f8
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/BlockedException.java
@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+public class BlockedException extends HttpException {
+
+  public BlockedException(String msg) {
+    super(msg);
+  }
+
+}


[38/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMerger.java b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMerger.java
new file mode 100644
index 0000000..ef12f52
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentMerger.java
@@ -0,0 +1,793 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.compress.DefaultCodec;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.SequenceFileRecordReader;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.metadata.MetaWrapper;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+/**
+ * This tool takes several segments and merges their data together. Only the
+ * latest versions of data are retained.
+ * <p>
+ * Optionally, you can apply current URLFilters to remove prohibited URL-s.
+ * </p>
+ * <p>
+ * Also, it's possible to slice the resulting segment into chunks of fixed size.
+ * </p>
+ * <h3>Important Notes</h3> <h4>Which parts are merged?</h4>
+ * <p>
+ * It doesn't make sense to merge data from segments which are at different
+ * stages of processing (e.g. one unfetched segment, one fetched but not parsed,
+ * and one fetched and parsed). Therefore, prior to merging, the tool will
+ * determine the lowest common set of input data, and only this data will be
+ * merged. This may have some unintended consequences: e.g. if majority of input
+ * segments are fetched and parsed, but one of them is unfetched, the tool will
+ * fall back to just merging fetchlists, and it will skip all other data from
+ * all segments.
+ * </p>
+ * <h4>Merging fetchlists</h4>
+ * <p>
+ * Merging segments, which contain just fetchlists (i.e. prior to fetching) is
+ * not recommended, because this tool (unlike the
+ * {@link org.apache.nutch.crawl.Generator}) doesn't ensure that fetchlist parts
+ * for each map task are disjoint.
+ * </p>
+ * <p>
+ * <h4>Duplicate content</h4>
+ * Merging segments removes older content whenever possible (see below).
+ * However, this is NOT the same as de-duplication, which in addition removes
+ * identical content found at different URL-s. In other words, running
+ * DeleteDuplicates is still necessary.
+ * </p>
+ * <p>
+ * For some types of data (especially ParseText) it's not possible to determine
+ * which version is really older. Therefore the tool always uses segment names
+ * as timestamps, for all types of input data. Segment names are compared in
+ * forward lexicographic order (0-9a-zA-Z), and data from segments with "higher"
+ * names will prevail. It follows then that it is extremely important that
+ * segments be named in an increasing lexicographic order as their creation time
+ * increases.
+ * </p>
+ * <p>
+ * <h4>Merging and indexes</h4>
+ * Merged segment gets a different name. Since Indexer embeds segment names in
+ * indexes, any indexes originally created for the input segments will NOT work
+ * with the merged segment. Newly created merged segment(s) need to be indexed
+ * afresh. This tool doesn't use existing indexes in any way, so if you plan to
+ * merge segments you don't have to index them prior to merging.
+ * 
+ * 
+ * @author Andrzej Bialecki
+ */
+public class SegmentMerger extends Configured implements Tool,
+    Mapper<Text, MetaWrapper, Text, MetaWrapper>,
+    Reducer<Text, MetaWrapper, Text, MetaWrapper> {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SegmentMerger.class);
+
+  private static final String SEGMENT_PART_KEY = "part";
+  private static final String SEGMENT_SLICE_KEY = "slice";
+
+  private URLFilters filters = null;
+  private URLNormalizers normalizers = null;
+  private SegmentMergeFilters mergeFilters = null;
+  private long sliceSize = -1;
+  private long curCount = 0;
+
+  /**
+   * Wraps inputs in a {@link MetaWrapper}, to permit merging different types
+   * in reduce and use additional metadata.
+   */
+  public static class ObjectInputFormat extends
+      SequenceFileInputFormat<Text, MetaWrapper> {
+
+    @Override
+    public RecordReader<Text, MetaWrapper> getRecordReader(
+        final InputSplit split, final JobConf job, Reporter reporter)
+        throws IOException {
+
+      reporter.setStatus(split.toString());
+
+      // find part name
+      SegmentPart segmentPart;
+      final String spString;
+      final FileSplit fSplit = (FileSplit) split;
+      try {
+        segmentPart = SegmentPart.get(fSplit);
+        spString = segmentPart.toString();
+      } catch (IOException e) {
+        throw new RuntimeException("Cannot identify segment:", e);
+      }
+
+      SequenceFile.Reader reader = new SequenceFile.Reader(job, SequenceFile.Reader.file(fSplit.getPath()));
+
+      final Writable w;
+      try {
+        w = (Writable) reader.getValueClass().newInstance();
+      } catch (Exception e) {
+        throw new IOException(e.toString());
+      } finally {
+        try {
+          reader.close();
+        } catch (Exception e) {
+          // ignore
+        }
+      }
+      final SequenceFileRecordReader<Text, Writable> splitReader = new SequenceFileRecordReader<Text, Writable>(
+          job, (FileSplit) split);
+
+      try {
+        return new SequenceFileRecordReader<Text, MetaWrapper>(job, fSplit) {
+
+          public synchronized boolean next(Text key, MetaWrapper wrapper)
+              throws IOException {
+            LOG.debug("Running OIF.next()");
+
+            boolean res = splitReader.next(key, w);
+            wrapper.set(w);
+            wrapper.setMeta(SEGMENT_PART_KEY, spString);
+            return res;
+          }
+
+          @Override
+          public synchronized void close() throws IOException {
+            splitReader.close();
+          }
+
+          @Override
+          public MetaWrapper createValue() {
+            return new MetaWrapper();
+          }
+
+        };
+      } catch (IOException e) {
+        throw new RuntimeException("Cannot create RecordReader: ", e);
+      }
+    }
+  }
+
+  public static class SegmentOutputFormat extends
+      FileOutputFormat<Text, MetaWrapper> {
+    private static final String DEFAULT_SLICE = "default";
+
+    @Override
+    public RecordWriter<Text, MetaWrapper> getRecordWriter(final FileSystem fs,
+        final JobConf job, final String name, final Progressable progress)
+        throws IOException {
+      return new RecordWriter<Text, MetaWrapper>() {
+        MapFile.Writer c_out = null;
+        MapFile.Writer f_out = null;
+        MapFile.Writer pd_out = null;
+        MapFile.Writer pt_out = null;
+        SequenceFile.Writer g_out = null;
+        SequenceFile.Writer p_out = null;
+        HashMap<String, Closeable> sliceWriters = new HashMap<String, Closeable>();
+        String segmentName = job.get("segment.merger.segmentName");
+
+        public void write(Text key, MetaWrapper wrapper) throws IOException {
+          // unwrap
+          SegmentPart sp = SegmentPart.parse(wrapper.getMeta(SEGMENT_PART_KEY));
+          Writable o = wrapper.get();
+          String slice = wrapper.getMeta(SEGMENT_SLICE_KEY);
+          if (o instanceof CrawlDatum) {
+            if (sp.partName.equals(CrawlDatum.GENERATE_DIR_NAME)) {
+              g_out = ensureSequenceFile(slice, CrawlDatum.GENERATE_DIR_NAME);
+              g_out.append(key, o);
+            } else if (sp.partName.equals(CrawlDatum.FETCH_DIR_NAME)) {
+              f_out = ensureMapFile(slice, CrawlDatum.FETCH_DIR_NAME,
+                  CrawlDatum.class);
+              f_out.append(key, o);
+            } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
+              p_out = ensureSequenceFile(slice, CrawlDatum.PARSE_DIR_NAME);
+              p_out.append(key, o);
+            } else {
+              throw new IOException("Cannot determine segment part: "
+                  + sp.partName);
+            }
+          } else if (o instanceof Content) {
+            c_out = ensureMapFile(slice, Content.DIR_NAME, Content.class);
+            c_out.append(key, o);
+          } else if (o instanceof ParseData) {
+            // update the segment name inside contentMeta - required by Indexer
+            if (slice == null) {
+              ((ParseData) o).getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+                  segmentName);
+            } else {
+              ((ParseData) o).getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+                  segmentName + "-" + slice);
+            }
+            pd_out = ensureMapFile(slice, ParseData.DIR_NAME, ParseData.class);
+            pd_out.append(key, o);
+          } else if (o instanceof ParseText) {
+            pt_out = ensureMapFile(slice, ParseText.DIR_NAME, ParseText.class);
+            pt_out.append(key, o);
+          }
+        }
+
+        // lazily create SequenceFile-s.
+        private SequenceFile.Writer ensureSequenceFile(String slice,
+            String dirName) throws IOException {
+          if (slice == null)
+            slice = DEFAULT_SLICE;
+          SequenceFile.Writer res = (SequenceFile.Writer) sliceWriters
+              .get(slice + dirName);
+          if (res != null)
+            return res;
+          Path wname;
+          Path out = FileOutputFormat.getOutputPath(job);
+          if (slice == DEFAULT_SLICE) {
+            wname = new Path(new Path(new Path(out, segmentName), dirName),
+                name);
+          } else {
+            wname = new Path(new Path(new Path(out, segmentName + "-" + slice),
+                dirName), name);
+          }
+          
+//          Option rKeyClassOpt = MapFile.Writer.keyClass(Text.class);
+//          org.apache.hadoop.io.SequenceFile.Writer.Option rValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+//          Option rProgressOpt = (Option) SequenceFile.Writer.progressable(progress);
+//          Option rCompOpt = (Option) SequenceFile.Writer.compression(SequenceFileOutputFormat.getOutputCompressionType(job));
+//          Option rFileOpt = (Option) SequenceFile.Writer.file(wname);
+          
+          //res = SequenceFile.createWriter(job, rFileOpt, rKeyClassOpt,
+           //   rValClassOpt, rCompOpt, rProgressOpt);
+          
+          res = SequenceFile.createWriter(job, SequenceFile.Writer.file(wname),
+              SequenceFile.Writer.keyClass(Text.class),
+              SequenceFile.Writer.valueClass(CrawlDatum.class),
+              SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+              SequenceFile.Writer.replication(fs.getDefaultReplication(wname)),
+              SequenceFile.Writer.blockSize(1073741824),
+              SequenceFile.Writer.compression(SequenceFileOutputFormat.getOutputCompressionType(job), new DefaultCodec()),
+              SequenceFile.Writer.progressable(progress),
+              SequenceFile.Writer.metadata(new Metadata())); 
+          
+          sliceWriters.put(slice + dirName, res);
+          return res;
+        }
+
+        // lazily create MapFile-s.
+        private MapFile.Writer ensureMapFile(String slice, String dirName,
+            Class<? extends Writable> clazz) throws IOException {
+          if (slice == null)
+            slice = DEFAULT_SLICE;
+          MapFile.Writer res = (MapFile.Writer) sliceWriters.get(slice
+              + dirName);
+          if (res != null)
+            return res;
+          Path wname;
+          Path out = FileOutputFormat.getOutputPath(job);
+          if (slice == DEFAULT_SLICE) {
+            wname = new Path(new Path(new Path(out, segmentName), dirName),
+                name);
+          } else {
+            wname = new Path(new Path(new Path(out, segmentName + "-" + slice),
+                dirName), name);
+          }
+          CompressionType compType = SequenceFileOutputFormat
+              .getOutputCompressionType(job);
+          if (clazz.isAssignableFrom(ParseText.class)) {
+            compType = CompressionType.RECORD;
+          }
+          
+          Option rKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+          org.apache.hadoop.io.SequenceFile.Writer.Option rValClassOpt = SequenceFile.Writer.valueClass(clazz);
+          org.apache.hadoop.io.SequenceFile.Writer.Option rProgressOpt = SequenceFile.Writer.progressable(progress);
+          org.apache.hadoop.io.SequenceFile.Writer.Option rCompOpt = SequenceFile.Writer.compression(compType);
+          
+          res = new MapFile.Writer(job, wname, rKeyClassOpt,
+              rValClassOpt, rCompOpt, rProgressOpt);
+          sliceWriters.put(slice + dirName, res);
+          return res;
+        }
+
+        public void close(Reporter reporter) throws IOException {
+          Iterator<Closeable> it = sliceWriters.values().iterator();
+          while (it.hasNext()) {
+            Object o = it.next();
+            if (o instanceof SequenceFile.Writer) {
+              ((SequenceFile.Writer) o).close();
+            } else {
+              ((MapFile.Writer) o).close();
+            }
+          }
+        }
+      };
+    }
+  }
+
+  public SegmentMerger() {
+    super(null);
+  }
+
+  public SegmentMerger(Configuration conf) {
+    super(conf);
+  }
+
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null)
+      return;
+    if (conf.getBoolean("segment.merger.filter", false)) {
+      filters = new URLFilters(conf);
+      mergeFilters = new SegmentMergeFilters(conf);
+    }
+    if (conf.getBoolean("segment.merger.normalizer", false))
+      normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
+    sliceSize = conf.getLong("segment.merger.slice", -1);
+    if ((sliceSize > 0) && (LOG.isInfoEnabled())) {
+      LOG.info("Slice size: " + sliceSize + " URLs.");
+    }
+  }
+
+  public void close() throws IOException {
+  }
+
+  public void configure(JobConf conf) {
+    setConf(conf);
+    if (sliceSize > 0) {
+      sliceSize = sliceSize / conf.getNumReduceTasks();
+    }
+  }
+
+  private Text newKey = new Text();
+
+  public void map(Text key, MetaWrapper value,
+      OutputCollector<Text, MetaWrapper> output, Reporter reporter)
+      throws IOException {
+    String url = key.toString();
+    if (normalizers != null) {
+      try {
+        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); // normalize
+                                                                        // the
+                                                                        // url
+      } catch (Exception e) {
+        LOG.warn("Skipping " + url + ":" + e.getMessage());
+        url = null;
+      }
+    }
+    if (url != null && filters != null) {
+      try {
+        url = filters.filter(url);
+      } catch (Exception e) {
+        LOG.warn("Skipping key " + url + ": " + e.getMessage());
+        url = null;
+      }
+    }
+    if (url != null) {
+      newKey.set(url);
+      output.collect(newKey, value);
+    }
+  }
+
+  /**
+   * NOTE: in selecting the latest version we rely exclusively on the segment
+   * name (not all segment data contain time information). Therefore it is
+   * extremely important that segments be named in an increasing lexicographic
+   * order as their creation time increases.
+   */
+  public void reduce(Text key, Iterator<MetaWrapper> values,
+      OutputCollector<Text, MetaWrapper> output, Reporter reporter)
+      throws IOException {
+    CrawlDatum lastG = null;
+    CrawlDatum lastF = null;
+    CrawlDatum lastSig = null;
+    Content lastC = null;
+    ParseData lastPD = null;
+    ParseText lastPT = null;
+    String lastGname = null;
+    String lastFname = null;
+    String lastSigname = null;
+    String lastCname = null;
+    String lastPDname = null;
+    String lastPTname = null;
+    TreeMap<String, ArrayList<CrawlDatum>> linked = new TreeMap<String, ArrayList<CrawlDatum>>();
+    while (values.hasNext()) {
+      MetaWrapper wrapper = values.next();
+      Object o = wrapper.get();
+      String spString = wrapper.getMeta(SEGMENT_PART_KEY);
+      if (spString == null) {
+        throw new IOException("Null segment part, key=" + key);
+      }
+      SegmentPart sp = SegmentPart.parse(spString);
+      if (o instanceof CrawlDatum) {
+        CrawlDatum val = (CrawlDatum) o;
+        // check which output dir it belongs to
+        if (sp.partName.equals(CrawlDatum.GENERATE_DIR_NAME)) {
+          if (lastG == null) {
+            lastG = val;
+            lastGname = sp.segmentName;
+          } else {
+            // take newer
+            if (lastGname.compareTo(sp.segmentName) < 0) {
+              lastG = val;
+              lastGname = sp.segmentName;
+            }
+          }
+        } else if (sp.partName.equals(CrawlDatum.FETCH_DIR_NAME)) {
+          // only consider fetch status and ignore fetch retry status
+          // https://issues.apache.org/jira/browse/NUTCH-1520
+          // https://issues.apache.org/jira/browse/NUTCH-1113
+          if (CrawlDatum.hasFetchStatus(val)
+              && val.getStatus() != CrawlDatum.STATUS_FETCH_RETRY
+              && val.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
+            if (lastF == null) {
+              lastF = val;
+              lastFname = sp.segmentName;
+            } else {
+              if (lastFname.compareTo(sp.segmentName) < 0) {
+                lastF = val;
+                lastFname = sp.segmentName;
+              }
+            }
+          }
+        } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) {
+          if (val.getStatus() == CrawlDatum.STATUS_SIGNATURE) {
+            if (lastSig == null) {
+              lastSig = val;
+              lastSigname = sp.segmentName;
+            } else {
+              // take newer
+              if (lastSigname.compareTo(sp.segmentName) < 0) {
+                lastSig = val;
+                lastSigname = sp.segmentName;
+              }
+            }
+            continue;
+          }
+          // collect all LINKED values from the latest segment
+          ArrayList<CrawlDatum> segLinked = linked.get(sp.segmentName);
+          if (segLinked == null) {
+            segLinked = new ArrayList<CrawlDatum>();
+            linked.put(sp.segmentName, segLinked);
+          }
+          segLinked.add(val);
+        } else {
+          throw new IOException("Cannot determine segment part: " + sp.partName);
+        }
+      } else if (o instanceof Content) {
+        if (lastC == null) {
+          lastC = (Content) o;
+          lastCname = sp.segmentName;
+        } else {
+          if (lastCname.compareTo(sp.segmentName) < 0) {
+            lastC = (Content) o;
+            lastCname = sp.segmentName;
+          }
+        }
+      } else if (o instanceof ParseData) {
+        if (lastPD == null) {
+          lastPD = (ParseData) o;
+          lastPDname = sp.segmentName;
+        } else {
+          if (lastPDname.compareTo(sp.segmentName) < 0) {
+            lastPD = (ParseData) o;
+            lastPDname = sp.segmentName;
+          }
+        }
+      } else if (o instanceof ParseText) {
+        if (lastPT == null) {
+          lastPT = (ParseText) o;
+          lastPTname = sp.segmentName;
+        } else {
+          if (lastPTname.compareTo(sp.segmentName) < 0) {
+            lastPT = (ParseText) o;
+            lastPTname = sp.segmentName;
+          }
+        }
+      }
+    }
+    // perform filtering based on full merge record
+    if (mergeFilters != null
+        && !mergeFilters.filter(key, lastG, lastF, lastSig, lastC, lastPD,
+            lastPT, linked.isEmpty() ? null : linked.lastEntry().getValue())) {
+      return;
+    }
+
+    curCount++;
+    String sliceName = null;
+    MetaWrapper wrapper = new MetaWrapper();
+    if (sliceSize > 0) {
+      sliceName = String.valueOf(curCount / sliceSize);
+      wrapper.setMeta(SEGMENT_SLICE_KEY, sliceName);
+    }
+    SegmentPart sp = new SegmentPart();
+    // now output the latest values
+    if (lastG != null) {
+      wrapper.set(lastG);
+      sp.partName = CrawlDatum.GENERATE_DIR_NAME;
+      sp.segmentName = lastGname;
+      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
+      output.collect(key, wrapper);
+    }
+    if (lastF != null) {
+      wrapper.set(lastF);
+      sp.partName = CrawlDatum.FETCH_DIR_NAME;
+      sp.segmentName = lastFname;
+      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
+      output.collect(key, wrapper);
+    }
+    if (lastSig != null) {
+      wrapper.set(lastSig);
+      sp.partName = CrawlDatum.PARSE_DIR_NAME;
+      sp.segmentName = lastSigname;
+      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
+      output.collect(key, wrapper);
+    }
+    if (lastC != null) {
+      wrapper.set(lastC);
+      sp.partName = Content.DIR_NAME;
+      sp.segmentName = lastCname;
+      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
+      output.collect(key, wrapper);
+    }
+    if (lastPD != null) {
+      wrapper.set(lastPD);
+      sp.partName = ParseData.DIR_NAME;
+      sp.segmentName = lastPDname;
+      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
+      output.collect(key, wrapper);
+    }
+    if (lastPT != null) {
+      wrapper.set(lastPT);
+      sp.partName = ParseText.DIR_NAME;
+      sp.segmentName = lastPTname;
+      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
+      output.collect(key, wrapper);
+    }
+    if (linked.size() > 0) {
+      String name = linked.lastKey();
+      sp.partName = CrawlDatum.PARSE_DIR_NAME;
+      sp.segmentName = name;
+      wrapper.setMeta(SEGMENT_PART_KEY, sp.toString());
+      ArrayList<CrawlDatum> segLinked = linked.get(name);
+      for (int i = 0; i < segLinked.size(); i++) {
+        CrawlDatum link = segLinked.get(i);
+        wrapper.set(link);
+        output.collect(key, wrapper);
+      }
+    }
+  }
+
+  public void merge(Path out, Path[] segs, boolean filter, boolean normalize,
+      long slice) throws Exception {
+    String segmentName = Generator.generateSegmentName();
+    if (LOG.isInfoEnabled()) {
+      LOG.info("Merging " + segs.length + " segments to " + out + "/"
+          + segmentName);
+    }
+    JobConf job = new NutchJob(getConf());
+    job.setJobName("mergesegs " + out + "/" + segmentName);
+    job.setBoolean("segment.merger.filter", filter);
+    job.setBoolean("segment.merger.normalizer", normalize);
+    job.setLong("segment.merger.slice", slice);
+    job.set("segment.merger.segmentName", segmentName);
+    FileSystem fs = FileSystem.get(getConf());
+    // prepare the minimal common set of input dirs
+    boolean g = true;
+    boolean f = true;
+    boolean p = true;
+    boolean c = true;
+    boolean pd = true;
+    boolean pt = true;
+    
+    // These contain previous values, we use it to track changes in the loop
+    boolean pg = true;
+    boolean pf = true;
+    boolean pp = true;
+    boolean pc = true;
+    boolean ppd = true;
+    boolean ppt = true;
+    for (int i = 0; i < segs.length; i++) {
+      if (!fs.exists(segs[i])) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Input dir " + segs[i] + " doesn't exist, skipping.");
+        }
+        segs[i] = null;
+        continue;
+      }
+      if (LOG.isInfoEnabled()) {
+        LOG.info("SegmentMerger:   adding " + segs[i]);
+      }
+      Path cDir = new Path(segs[i], Content.DIR_NAME);
+      Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
+      Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
+      Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
+      Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
+      Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
+      c = c && fs.exists(cDir);
+      g = g && fs.exists(gDir);
+      f = f && fs.exists(fDir);
+      p = p && fs.exists(pDir);
+      pd = pd && fs.exists(pdDir);
+      pt = pt && fs.exists(ptDir);
+      
+      // Input changed?
+      if (g != pg || f != pf || p != pp || c != pc || pd != ppd || pt != ppt) {
+        LOG.info(segs[i] + " changed input dirs");
+      }
+      
+      pg = g; pf = f; pp = p; pc = c; ppd = pd; ppt = pt;
+    }
+    StringBuffer sb = new StringBuffer();
+    if (c)
+      sb.append(" " + Content.DIR_NAME);
+    if (g)
+      sb.append(" " + CrawlDatum.GENERATE_DIR_NAME);
+    if (f)
+      sb.append(" " + CrawlDatum.FETCH_DIR_NAME);
+    if (p)
+      sb.append(" " + CrawlDatum.PARSE_DIR_NAME);
+    if (pd)
+      sb.append(" " + ParseData.DIR_NAME);
+    if (pt)
+      sb.append(" " + ParseText.DIR_NAME);
+    if (LOG.isInfoEnabled()) {
+      LOG.info("SegmentMerger: using segment data from:" + sb.toString());
+    }
+    for (int i = 0; i < segs.length; i++) {
+      if (segs[i] == null)
+        continue;
+      if (g) {
+        Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
+        FileInputFormat.addInputPath(job, gDir);
+      }
+      if (c) {
+        Path cDir = new Path(segs[i], Content.DIR_NAME);
+        FileInputFormat.addInputPath(job, cDir);
+      }
+      if (f) {
+        Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
+        FileInputFormat.addInputPath(job, fDir);
+      }
+      if (p) {
+        Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
+        FileInputFormat.addInputPath(job, pDir);
+      }
+      if (pd) {
+        Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
+        FileInputFormat.addInputPath(job, pdDir);
+      }
+      if (pt) {
+        Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
+        FileInputFormat.addInputPath(job, ptDir);
+      }
+    }
+    job.setInputFormat(ObjectInputFormat.class);
+    job.setMapperClass(SegmentMerger.class);
+    job.setReducerClass(SegmentMerger.class);
+    FileOutputFormat.setOutputPath(job, out);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(MetaWrapper.class);
+    job.setOutputFormat(SegmentOutputFormat.class);
+
+    setConf(job);
+
+    JobClient.runJob(job);
+  }
+
+  /**
+   * @param args
+   */
+  public int run(String[] args)  throws Exception {
+    if (args.length < 2) {
+      System.err
+          .println("SegmentMerger output_dir (-dir segments | seg1 seg2 ...) [-filter] [-slice NNNN]");
+      System.err
+          .println("\toutput_dir\tname of the parent dir for output segment slice(s)");
+      System.err
+          .println("\t-dir segments\tparent dir containing several segments");
+      System.err.println("\tseg1 seg2 ...\tlist of segment dirs");
+      System.err
+          .println("\t-filter\t\tfilter out URL-s prohibited by current URLFilters");
+      System.err
+          .println("\t-normalize\t\tnormalize URL via current URLNormalizers");
+      System.err
+          .println("\t-slice NNNN\tcreate many output segments, each containing NNNN URLs");
+      return -1;
+    }
+    Configuration conf = NutchConfiguration.create();
+    final FileSystem fs = FileSystem.get(conf);
+    Path out = new Path(args[0]);
+    ArrayList<Path> segs = new ArrayList<Path>();
+    long sliceSize = 0;
+    boolean filter = false;
+    boolean normalize = false;
+    for (int i = 1; i < args.length; i++) {
+      if (args[i].equals("-dir")) {
+        FileStatus[] fstats = fs.listStatus(new Path(args[++i]),
+            HadoopFSUtil.getPassDirectoriesFilter(fs));
+        Path[] files = HadoopFSUtil.getPaths(fstats);
+        for (int j = 0; j < files.length; j++)
+          segs.add(files[j]);
+      } else if (args[i].equals("-filter")) {
+        filter = true;
+      } else if (args[i].equals("-normalize")) {
+        normalize = true;
+      } else if (args[i].equals("-slice")) {
+        sliceSize = Long.parseLong(args[++i]);
+      } else {
+        segs.add(new Path(args[i]));
+      }
+    }
+    if (segs.size() == 0) {
+      System.err.println("ERROR: No input segments.");
+      return -1;
+    }
+
+    merge(out, segs.toArray(new Path[segs.size()]), filter, normalize,
+        sliceSize);
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    int result = ToolRunner.run(NutchConfiguration.create(),
+        new SegmentMerger(), args);
+    System.exit(result);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/segment/SegmentPart.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/segment/SegmentPart.java b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentPart.java
new file mode 100644
index 0000000..84247e4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentPart.java
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.io.IOException;
+
+import org.apache.hadoop.mapred.FileSplit;
+
+/**
+ * Utility class for handling information about segment parts.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class SegmentPart {
+  /** Name of the segment (just the last path component). */
+  public String segmentName;
+  /** Name of the segment part (ie. one of subdirectories inside a segment). */
+  public String partName;
+
+  public SegmentPart() {
+
+  }
+
+  public SegmentPart(String segmentName, String partName) {
+    this.segmentName = segmentName;
+    this.partName = partName;
+  }
+
+  /**
+   * Return a String representation of this class, in the form
+   * "segmentName/partName".
+   */
+  public String toString() {
+    return segmentName + "/" + partName;
+  }
+
+  /**
+   * Create SegmentPart from a FileSplit.
+   * 
+   * @param split
+   * @return A {@link SegmentPart} resultant from a {@link FileSplit}.
+   * @throws Exception
+   */
+  public static SegmentPart get(FileSplit split) throws IOException {
+    return get(split.getPath().toString());
+  }
+
+  /**
+   * Create SegmentPart from a full path of a location inside any segment part.
+   * 
+   * @param path
+   *          full path into a segment part (may include "part-xxxxx"
+   *          components)
+   * @return SegmentPart instance describing this part.
+   * @throws IOException
+   *           if any required path components are missing.
+   */
+  public static SegmentPart get(String path) throws IOException {
+    // find part name
+    String dir = path.replace('\\', '/');
+    int idx = dir.lastIndexOf("/part-");
+    if (idx == -1) {
+      throw new IOException("Cannot determine segment part: " + dir);
+    }
+    dir = dir.substring(0, idx);
+    idx = dir.lastIndexOf('/');
+    if (idx == -1) {
+      throw new IOException("Cannot determine segment part: " + dir);
+    }
+    String part = dir.substring(idx + 1);
+    // find segment name
+    dir = dir.substring(0, idx);
+    idx = dir.lastIndexOf('/');
+    if (idx == -1) {
+      throw new IOException("Cannot determine segment name: " + dir);
+    }
+    String segment = dir.substring(idx + 1);
+    return new SegmentPart(segment, part);
+  }
+
+  /**
+   * Create SegmentPart from a String in format "segmentName/partName".
+   * 
+   * @param string
+   *          input String
+   * @return parsed instance of SegmentPart
+   * @throws IOException
+   *           if "/" is missing.
+   */
+  public static SegmentPart parse(String string) throws IOException {
+    int idx = string.indexOf('/');
+    if (idx == -1) {
+      throw new IOException("Invalid SegmentPart: '" + string + "'");
+    }
+    String segment = string.substring(0, idx);
+    String part = string.substring(idx + 1);
+    return new SegmentPart(segment, part);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/segment/SegmentReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/segment/SegmentReader.java b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentReader.java
new file mode 100644
index 0000000..d00d1e2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/segment/SegmentReader.java
@@ -0,0 +1,719 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.segment;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.io.Writer;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.Progressable;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
/**
 * Dump the content of a segment.
 * <p>
 * Supports three operations on a Nutch segment directory: dumping all records
 * as text ({@code -dump}), listing per-segment statistics ({@code -list}),
 * and retrieving a single record by URL ({@code -get}). Which segment
 * subdirectories (crawl_generate, crawl_fetch, crawl_parse, content,
 * parse_data, parse_text) are consulted is controlled by the boolean flags
 * {@code co/fe/ge/pa/pd/pt}.
 */
public class SegmentReader extends Configured implements
    Reducer<Text, NutchWritable, Text, Text> {

  public static final Logger LOG = LoggerFactory.getLogger(SegmentReader.class);

  // Running record counter, emitted as "Recno:: N" by reduce(); the counter
  // is renumbered again in append() when the per-part dump files are
  // concatenated.
  long recNo = 0L;

  // Flags selecting which segment parts to read: content (co), crawl_fetch
  // (fe), crawl_generate (ge), crawl_parse (pa), parse_data (pd) and
  // parse_text (pt).
  private boolean co, fe, ge, pa, pd, pt;
  private FileSystem fs;

  /**
   * Mapper that wraps each input value in a {@link NutchWritable} and copies
   * the key into a fresh {@link Text} instance for key-format compatibility.
   */
  public static class InputCompatMapper extends MapReduceBase implements
      Mapper<WritableComparable<?>, Writable, Text, NutchWritable> {
    private Text newKey = new Text();

    public void map(WritableComparable<?> key, Writable value,
        OutputCollector<Text, NutchWritable> collector, Reporter reporter)
        throws IOException {
      // convert on the fly from old formats with UTF8 keys.
      // UTF8 deprecated and replaced by Text.
      // NOTE(review): the condition matches Text (the *new* format), so a
      // non-Text key would be cast directly below -- confirm that old UTF8
      // keys are converted elsewhere.
      if (key instanceof Text) {
        newKey.set(key.toString());
        key = newKey;
      }
      collector.collect((Text) key, new NutchWritable(value));
    }

  }

  /** Implements a text output format */
  public static class TextOutputFormat extends
      FileOutputFormat<WritableComparable<?>, Writable> {
    public RecordWriter<WritableComparable<?>, Writable> getRecordWriter(
        final FileSystem fs, JobConf job, String name,
        final Progressable progress) throws IOException {

      final Path segmentDumpFile = new Path(
          FileOutputFormat.getOutputPath(job), name);

      // Get the old copy out of the way
      if (fs.exists(segmentDumpFile))
        fs.delete(segmentDumpFile, true);

      final PrintStream printStream = new PrintStream(
          fs.create(segmentDumpFile));
      return new RecordWriter<WritableComparable<?>, Writable>() {
        // Only the value is written; the key is already embedded in the
        // textual dump produced by reduce().
        public synchronized void write(WritableComparable<?> key, Writable value)
            throws IOException {
          printStream.println(value);
        }

        public synchronized void close(Reporter reporter) throws IOException {
          printStream.close();
        }
      };
    }
  }

  /**
   * No-arg constructor for the MapReduce framework; flags and filesystem are
   * initialized later via {@link #configure(JobConf)}.
   */
  public SegmentReader() {
    super(null);
  }

  /**
   * Creates a reader with explicit part-selection flags.
   *
   * @param conf configuration to use
   * @param co read the content directory
   * @param fe read the crawl_fetch directory
   * @param ge read the crawl_generate directory
   * @param pa read the crawl_parse directory
   * @param pd read the parse_data directory
   * @param pt read the parse_text directory
   */
  public SegmentReader(Configuration conf, boolean co, boolean fe, boolean ge,
      boolean pa, boolean pd, boolean pt) {
    super(conf);
    this.co = co;
    this.fe = fe;
    this.ge = ge;
    this.pa = pa;
    this.pd = pd;
    this.pt = pt;
    try {
      this.fs = FileSystem.get(getConf());
    } catch (IOException e) {
      LOG.error("IOException:", e);
    }
  }

  /**
   * Reads the part-selection flags back from the job configuration (they are
   * stored there by {@link #createJobConf()}), defaulting each to true.
   */
  public void configure(JobConf job) {
    setConf(job);
    this.co = getConf().getBoolean("segment.reader.co", true);
    this.fe = getConf().getBoolean("segment.reader.fe", true);
    this.ge = getConf().getBoolean("segment.reader.ge", true);
    this.pa = getConf().getBoolean("segment.reader.pa", true);
    this.pd = getConf().getBoolean("segment.reader.pd", true);
    this.pt = getConf().getBoolean("segment.reader.pt", true);
    try {
      this.fs = FileSystem.get(getConf());
    } catch (IOException e) {
      LOG.error("IOException:", e);
    }
  }

  /**
   * Builds a job configuration carrying the current flag values so that the
   * reducer tasks can recover them in {@link #configure(JobConf)}.
   */
  private JobConf createJobConf() {
    JobConf job = new NutchJob(getConf());
    job.setBoolean("segment.reader.co", this.co);
    job.setBoolean("segment.reader.fe", this.fe);
    job.setBoolean("segment.reader.ge", this.ge);
    job.setBoolean("segment.reader.pa", this.pa);
    job.setBoolean("segment.reader.pd", this.pd);
    job.setBoolean("segment.reader.pt", this.pt);
    return job;
  }

  public void close() {
  }

  /**
   * Concatenates every value collected for a URL into a single
   * human-readable text record labeled with a running record number.
   */
  public void reduce(Text key, Iterator<NutchWritable> values,
      OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
    StringBuffer dump = new StringBuffer();

    dump.append("\nRecno:: ").append(recNo++).append("\n");
    dump.append("URL:: " + key.toString() + "\n");
    while (values.hasNext()) {
      Writable value = values.next().get(); // unwrap
      if (value instanceof CrawlDatum) {
        dump.append("\nCrawlDatum::\n").append(((CrawlDatum) value).toString());
      } else if (value instanceof Content) {
        dump.append("\nContent::\n").append(((Content) value).toString());
      } else if (value instanceof ParseData) {
        dump.append("\nParseData::\n").append(((ParseData) value).toString());
      } else if (value instanceof ParseText) {
        dump.append("\nParseText::\n").append(((ParseText) value).toString());
      } else if (LOG.isWarnEnabled()) {
        LOG.warn("Unrecognized type: " + value.getClass());
      }
    }
    output.collect(key, new Text(dump.toString()));
  }

  /**
   * Runs a MapReduce job that dumps the selected parts of {@code segment} as
   * text, then concatenates the per-part outputs into a single "dump" file
   * under {@code output}.
   *
   * @param segment segment directory to read
   * @param output directory receiving the concatenated dump file
   * @throws IOException if the job or the final concatenation fails
   */
  public void dump(Path segment, Path output) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentReader: dump segment: " + segment);
    }

    JobConf job = createJobConf();
    job.setJobName("read " + segment);

    // Add one input path per enabled segment part.
    if (ge)
      FileInputFormat.addInputPath(job, new Path(segment,
          CrawlDatum.GENERATE_DIR_NAME));
    if (fe)
      FileInputFormat.addInputPath(job, new Path(segment,
          CrawlDatum.FETCH_DIR_NAME));
    if (pa)
      FileInputFormat.addInputPath(job, new Path(segment,
          CrawlDatum.PARSE_DIR_NAME));
    if (co)
      FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    if (pd)
      FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    if (pt)
      FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
    job.setReducerClass(SegmentReader.class);

    // Job output goes to a random temporary directory first; the part files
    // are merged into the final dump file below.
    Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segread-"
        + new java.util.Random().nextInt());
    fs.delete(tempDir, true);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    // concatenate the output
    Path dumpFile = new Path(output, job.get("segment.dump.dir", "dump"));

    // remove the old file
    fs.delete(dumpFile, true);
    FileStatus[] fstats = fs.listStatus(tempDir,
        HadoopFSUtil.getPassAllFilter());
    Path[] files = HadoopFSUtil.getPaths(fstats);

    PrintWriter writer = null;
    int currentRecordNumber = 0;
    if (files.length > 0) {
      writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
          fs.create(dumpFile))));
      try {
        for (int i = 0; i < files.length; i++) {
          Path partFile = files[i];
          try {
            currentRecordNumber = append(fs, job, partFile, writer,
                currentRecordNumber);
          } catch (IOException exception) {
            // A failed part is logged and skipped; the dump continues with
            // the remaining part files.
            if (LOG.isWarnEnabled()) {
              LOG.warn("Couldn't copy the content of " + partFile.toString()
                  + " into " + dumpFile.toString());
              LOG.warn(exception.getMessage());
            }
          }
        }
      } finally {
        writer.close();
      }
    }
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
      LOG.info("SegmentReader: done");
    }
  }

  /**
   * Appends the content of one part file to the dump writer, renumbering
   * "Recno::" lines so record numbers stay sequential across parts.
   *
   * @return the updated record counter
   */
  private int append(FileSystem fs, Configuration conf, Path src,
      PrintWriter writer, int currentRecordNumber) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(
        fs.open(src)));
    try {
      String line = reader.readLine();
      while (line != null) {
        if (line.startsWith("Recno:: ")) {
          line = "Recno:: " + currentRecordNumber++;
        }
        writer.println(line);
        line = reader.readLine();
      }
      return currentRecordNumber;
    } finally {
      reader.close();
    }
  }

  // Maps each part-flag name to the section header printed by get().
  private static final String[][] keys = new String[][] {
      { "co", "Content::\n" }, { "ge", "Crawl Generate::\n" },
      { "fe", "Crawl Fetch::\n" }, { "pa", "Crawl Parse::\n" },
      { "pd", "ParseData::\n" }, { "pt", "ParseText::\n" } };

  /**
   * Retrieves all records stored under {@code key} from every enabled segment
   * part (one lookup thread per part) and writes them to {@code writer},
   * grouped by part with the headers defined in {@code keys}.
   *
   * @param segment segment directory to read
   * @param key the record key (URL) to look up
   * @param writer destination for the formatted records
   * @param results map filled with the raw records, keyed by part-flag name
   */
  public void get(final Path segment, final Text key, Writer writer,
      final Map<String, List<Writable>> results) throws Exception {
    LOG.info("SegmentReader: get '" + key + "'");
    ArrayList<Thread> threads = new ArrayList<Thread>();
    if (co)
      threads.add(new Thread() {
        public void run() {
          try {
            List<Writable> res = getMapRecords(new Path(segment,
                Content.DIR_NAME), key);
            results.put("co", res);
          } catch (Exception e) {
            LOG.error("Exception:", e);
          }
        }
      });
    if (fe)
      threads.add(new Thread() {
        public void run() {
          try {
            List<Writable> res = getMapRecords(new Path(segment,
                CrawlDatum.FETCH_DIR_NAME), key);
            results.put("fe", res);
          } catch (Exception e) {
            LOG.error("Exception:", e);
          }
        }
      });
    if (ge)
      threads.add(new Thread() {
        public void run() {
          try {
            List<Writable> res = getSeqRecords(new Path(segment,
                CrawlDatum.GENERATE_DIR_NAME), key);
            results.put("ge", res);
          } catch (Exception e) {
            LOG.error("Exception:", e);
          }
        }
      });
    if (pa)
      threads.add(new Thread() {
        public void run() {
          try {
            List<Writable> res = getSeqRecords(new Path(segment,
                CrawlDatum.PARSE_DIR_NAME), key);
            results.put("pa", res);
          } catch (Exception e) {
            LOG.error("Exception:", e);
          }
        }
      });
    if (pd)
      threads.add(new Thread() {
        public void run() {
          try {
            List<Writable> res = getMapRecords(new Path(segment,
                ParseData.DIR_NAME), key);
            results.put("pd", res);
          } catch (Exception e) {
            LOG.error("Exception:", e);
          }
        }
      });
    if (pt)
      threads.add(new Thread() {
        public void run() {
          try {
            List<Writable> res = getMapRecords(new Path(segment,
                ParseText.DIR_NAME), key);
            results.put("pt", res);
          } catch (Exception e) {
            LOG.error("Exception:", e);
          }
        }
      });
    Iterator<Thread> it = threads.iterator();
    while (it.hasNext())
      it.next().start();
    // Poll every 5 seconds until all lookup threads have finished.
    int cnt;
    do {
      cnt = 0;
      try {
        Thread.sleep(5000);
      } catch (Exception e) {
      }
      ;
      it = threads.iterator();
      while (it.hasNext()) {
        if (it.next().isAlive())
          cnt++;
      }
      if ((cnt > 0) && (LOG.isDebugEnabled())) {
        LOG.debug("(" + cnt + " to retrieve)");
      }
    } while (cnt > 0);
    // Print the collected records in the fixed order defined by `keys`.
    for (int i = 0; i < keys.length; i++) {
      List<Writable> res = results.get(keys[i][0]);
      if (res != null && res.size() > 0) {
        for (int k = 0; k < res.size(); k++) {
          writer.write(keys[i][1]);
          writer.write(res.get(k) + "\n");
        }
      }
      writer.flush();
    }
  }

  /**
   * Collects all values stored under {@code key} from the MapFiles in
   * {@code dir}. Value instances are created reflectively from the readers'
   * declared value class; only Text keys are supported.
   */
  private List<Writable> getMapRecords(Path dir, Text key) throws Exception {
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir,
        getConf());
    ArrayList<Writable> res = new ArrayList<Writable>();
    Class<?> keyClass = readers[0].getKeyClass();
    Class<?> valueClass = readers[0].getValueClass();
    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
      throw new IOException("Incompatible key (" + keyClass.getName() + ")");
    Writable value = (Writable) valueClass.newInstance();
    // we don't know the partitioning schema
    for (int i = 0; i < readers.length; i++) {
      if (readers[i].get(key, value) != null) {
        res.add(value);
        // Allocate a fresh value for each stored record so the list does not
        // hold repeated references to a single mutated instance.
        value = (Writable) valueClass.newInstance();
        Text aKey = (Text) keyClass.newInstance();
        while (readers[i].next(aKey, value) && aKey.equals(key)) {
          res.add(value);
          value = (Writable) valueClass.newInstance();
        }
      }
      readers[i].close();
    }
    return res;
  }

  /**
   * Collects all values stored under {@code key} by scanning the
   * SequenceFiles in {@code dir} from start to end; only Text keys are
   * supported.
   */
  private List<Writable> getSeqRecords(Path dir, Text key) throws Exception {
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(
        getConf(), dir);
    ArrayList<Writable> res = new ArrayList<Writable>();
    Class<?> keyClass = readers[0].getKeyClass();
    Class<?> valueClass = readers[0].getValueClass();
    if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
      throw new IOException("Incompatible key (" + keyClass.getName() + ")");
    Writable aKey = (Writable) keyClass.newInstance();
    Writable value = (Writable) valueClass.newInstance();
    for (int i = 0; i < readers.length; i++) {
      while (readers[i].next(aKey, value)) {
        if (aKey.equals(key)) {
          res.add(value);
          value = (Writable) valueClass.newInstance();
        }
      }
      readers[i].close();
    }
    return res;
  }

  /** Per-segment statistics gathered by {@link #getStats}; -1 means unknown. */
  public static class SegmentReaderStats {
    public long start = -1L;
    public long end = -1L;
    public long generated = -1L;
    public long fetched = -1L;
    public long fetchErrors = -1L;
    public long parsed = -1L;
    public long parseErrors = -1L;
  }

  // NOTE(review): SimpleDateFormat is not thread-safe; this instance appears
  // to be used only from list() -- confirm before sharing across threads.
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");

  /**
   * Writes a tab-separated synopsis (name, generated/fetched/parsed counts,
   * fetch start/end times) for each segment directory; unknown values are
   * printed as "?".
   */
  public void list(List<Path> dirs, Writer writer) throws Exception {
    writer
        .write("NAME\t\tGENERATED\tFETCHER START\t\tFETCHER END\t\tFETCHED\tPARSED\n");
    for (int i = 0; i < dirs.size(); i++) {
      Path dir = dirs.get(i);
      SegmentReaderStats stats = new SegmentReaderStats();
      getStats(dir, stats);
      writer.write(dir.getName() + "\t");
      if (stats.generated == -1)
        writer.write("?");
      else
        writer.write(stats.generated + "");
      writer.write("\t\t");
      if (stats.start == -1)
        writer.write("?\t");
      else
        writer.write(sdf.format(new Date(stats.start)));
      writer.write("\t");
      if (stats.end == -1)
        writer.write("?");
      else
        writer.write(sdf.format(new Date(stats.end)));
      writer.write("\t");
      if (stats.fetched == -1)
        writer.write("?");
      else
        writer.write(stats.fetched + "");
      writer.write("\t");
      if (stats.parsed == -1)
        writer.write("?");
      else
        writer.write(stats.parsed + "");
      writer.write("\n");
      writer.flush();
    }
  }

  /**
   * Fills {@code stats} by scanning the enabled parts of {@code segment}:
   * generated count from crawl_generate, fetched count and fetch time range
   * from crawl_fetch, parsed count and parse errors from parse_data.
   */
  public void getStats(Path segment, final SegmentReaderStats stats)
      throws Exception {
    long cnt = 0L;
    Text key = new Text();
    
    if (ge) {
      SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(
          getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
      for (int i = 0; i < readers.length; i++) {
        while (readers[i].next(key))
          cnt++;
        readers[i].close();
      }
      stats.generated = cnt;
    }
    
    if (fe) {
      Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
      if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDirectory()) {
        cnt = 0L;
        long start = Long.MAX_VALUE;
        long end = Long.MIN_VALUE;
        CrawlDatum value = new CrawlDatum();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir,
            getConf());
        for (int i = 0; i < mreaders.length; i++) {
          while (mreaders[i].next(key, value)) {
            cnt++;
            // Track the earliest and latest fetch time across all records.
            if (value.getFetchTime() < start)
              start = value.getFetchTime();
            if (value.getFetchTime() > end)
              end = value.getFetchTime();
          }
          mreaders[i].close();
        }
        stats.start = start;
        stats.end = end;
        stats.fetched = cnt;
      }
    }
    
    if (pd) {
      Path parseDir = new Path(segment, ParseData.DIR_NAME);
      if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDirectory()) {
        cnt = 0L;
        long errors = 0L;
        ParseData value = new ParseData();
        MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir,
            getConf());
        for (int i = 0; i < mreaders.length; i++) {
          while (mreaders[i].next(key, value)) {
            cnt++;
            if (!value.getStatus().isSuccess())
              errors++;
          }
          mreaders[i].close();
        }
        stats.parsed = cnt;
        stats.parseErrors = errors;
      }
    }
  }

  // Operation modes selected by the first command-line argument.
  private static final int MODE_DUMP = 0;

  private static final int MODE_LIST = 1;

  private static final int MODE_GET = 2;

  /**
   * Command-line entry point. Parses the mode flag and the general
   * {@code -noXXX} part-selection options (consumed options are nulled out in
   * {@code args}), then dispatches to {@link #dump}, {@link #list} or
   * {@link #get}.
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      usage();
      return;
    }
    int mode = -1;
    if (args[0].equals("-dump"))
      mode = MODE_DUMP;
    else if (args[0].equals("-list"))
      mode = MODE_LIST;
    else if (args[0].equals("-get"))
      mode = MODE_GET;

    boolean co = true;
    boolean fe = true;
    boolean ge = true;
    boolean pa = true;
    boolean pd = true;
    boolean pt = true;
    // collect general options
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-nocontent")) {
        co = false;
        args[i] = null;
      } else if (args[i].equals("-nofetch")) {
        fe = false;
        args[i] = null;
      } else if (args[i].equals("-nogenerate")) {
        ge = false;
        args[i] = null;
      } else if (args[i].equals("-noparse")) {
        pa = false;
        args[i] = null;
      } else if (args[i].equals("-noparsedata")) {
        pd = false;
        args[i] = null;
      } else if (args[i].equals("-noparsetext")) {
        pt = false;
        args[i] = null;
      }
    }
    Configuration conf = NutchConfiguration.create();
    final FileSystem fs = FileSystem.get(conf);
    SegmentReader segmentReader = new SegmentReader(conf, co, fe, ge, pa, pd,
        pt);
    // collect required args
    switch (mode) {
    case MODE_DUMP:
      String input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String output = args.length > 2 ? args[2] : null;
      if (output == null) {
        System.err.println("Missing required argument: <output>");
        usage();
        return;
      }
      segmentReader.dump(new Path(input), new Path(output));
      return;
    case MODE_LIST:
      ArrayList<Path> dirs = new ArrayList<Path>();
      for (int i = 1; i < args.length; i++) {
        if (args[i] == null)
          continue;
        if (args[i].equals("-dir")) {
          // -dir: expand to all immediate subdirectories of the given path.
          Path dir = new Path(args[++i]);
          FileStatus[] fstats = fs.listStatus(dir,
              HadoopFSUtil.getPassDirectoriesFilter(fs));
          Path[] files = HadoopFSUtil.getPaths(fstats);
          if (files != null && files.length > 0) {
            dirs.addAll(Arrays.asList(files));
          }
        } else
          dirs.add(new Path(args[i]));
      }
      segmentReader.list(dirs, new OutputStreamWriter(System.out, "UTF-8"));
      return;
    case MODE_GET:
      input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String key = args.length > 2 ? args[2] : null;
      if (key == null) {
        System.err.println("Missing required argument: <keyValue>");
        usage();
        return;
      }
      segmentReader.get(new Path(input), new Text(key), new OutputStreamWriter(
          System.out, "UTF-8"), new HashMap<String, List<Writable>>());
      return;
    default:
      System.err.println("Invalid operation: " + args[0]);
      usage();
      return;
    }
  }

  /** Prints command-line usage information to stderr. */
  private static void usage() {
    System.err
        .println("Usage: SegmentReader (-dump ... | -list ... | -get ...) [general options]\n");
    System.err.println("* General options:");
    System.err.println("\t-nocontent\tignore content directory");
    System.err.println("\t-nofetch\tignore crawl_fetch directory");
    System.err.println("\t-nogenerate\tignore crawl_generate directory");
    System.err.println("\t-noparse\tignore crawl_parse directory");
    System.err.println("\t-noparsedata\tignore parse_data directory");
    System.err.println("\t-noparsetext\tignore parse_text directory");
    System.err.println();
    System.err
        .println("* SegmentReader -dump <segment_dir> <output> [general options]");
    System.err
        .println("  Dumps content of a <segment_dir> as a text file to <output>.\n");
    System.err.println("\t<segment_dir>\tname of the segment directory.");
    System.err
        .println("\t<output>\tname of the (non-existent) output directory.");
    System.err.println();
    System.err
        .println("* SegmentReader -list (<segment_dir1> ... | -dir <segments>) [general options]");
    System.err
        .println("  List a synopsis of segments in specified directories, or all segments in");
    System.err
        .println("  a directory <segments>, and print it on System.out\n");
    System.err
        .println("\t<segment_dir1> ...\tlist of segment directories to process");
    System.err
        .println("\t-dir <segments>\t\tdirectory that contains multiple segments");
    System.err.println();
    System.err
        .println("* SegmentReader -get <segment_dir> <keyValue> [general options]");
    System.err
        .println("  Get a specified record from a segment, and print it on System.out.\n");
    System.err.println("\t<segment_dir>\tname of the segment directory.");
    System.err.println("\t<keyValue>\tvalue of the key (url).");
    System.err
        .println("\t\tNote: put double-quotes around strings with spaces.");
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/segment/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/segment/package-info.java b/nutch-core/src/main/java/org/apache/nutch/segment/package-info.java
new file mode 100644
index 0000000..ecc0c26
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/segment/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/**
 * A segment stores all data from one generate/fetch/update cycle:
 * fetch list, protocol status, raw content, parsed content, and extracted outgoing links.
 */
package org.apache.nutch.segment;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/ConfManager.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/ConfManager.java b/nutch-core/src/main/java/org/apache/nutch/service/ConfManager.java
new file mode 100644
index 0000000..c71cfa9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/ConfManager.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service;
+
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.service.model.request.NutchConfig;
+
+public interface ConfManager {
+
+  public Configuration get(String confId);
+
+  public Map<String, String> getAsMap(String confId);
+
+  public void setProperty(String confId, String propName, String propValue);
+
+  public Set<String> list();
+
+  public String create(NutchConfig nutchConfig);
+
+  public void delete(String confId);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/JobManager.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/JobManager.java b/nutch-core/src/main/java/org/apache/nutch/service/JobManager.java
new file mode 100644
index 0000000..20346fc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/JobManager.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service;
+
+import java.util.Collection;
+import org.apache.nutch.service.model.request.JobConfig;
+import org.apache.nutch.service.model.response.JobInfo;
+import org.apache.nutch.service.model.response.JobInfo.State;
+
+public interface JobManager {
+
+  public static enum JobType{
+    INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS, INVERTLINKS, DEDUP
+  };
+  public Collection<JobInfo> list(String crawlId, State state);
+
+  public JobInfo get(String crawlId, String id);
+
+  /**
+   * Creates specified job
+   * @param jobConfig
+   * @return JobInfo
+   */
+  public JobInfo create(JobConfig jobConfig);
+
+  public boolean abort(String crawlId, String id);
+
+  public boolean stop(String crawlId, String id);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/NutchReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/NutchReader.java b/nutch-core/src/main/java/org/apache/nutch/service/NutchReader.java
new file mode 100644
index 0000000..00bb78f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/NutchReader.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service;
+
+import java.io.FileNotFoundException;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.service.impl.SequenceReader;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public interface  NutchReader {
+  
+  public static final Logger LOG = LoggerFactory.getLogger(NutchReader.class);
+  public static final Configuration conf = NutchConfiguration.create();
+  
+  public List read(String path) throws FileNotFoundException;
+  public List head(String path, int nrows) throws FileNotFoundException;
+  public List slice(String path, int start, int end) throws FileNotFoundException;
+  public int count(String path) throws FileNotFoundException;
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/NutchServer.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/NutchServer.java b/nutch-core/src/main/java/org/apache/nutch/service/NutchServer.java
new file mode 100644
index 0000000..e206707
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/NutchServer.java
@@ -0,0 +1,224 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.TimeUnit;
+
+import com.fasterxml.jackson.jaxrs.json.JacksonJaxbJsonProvider;
+
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.apache.commons.cli.CommandLine;
+import org.apache.cxf.binding.BindingFactoryManager;
+import org.apache.cxf.jaxrs.JAXRSBindingFactory;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.nutch.fetcher.FetchNodeDb;
+import org.apache.nutch.service.impl.ConfManagerImpl;
+import org.apache.nutch.service.impl.JobFactory;
+import org.apache.nutch.service.impl.JobManagerImpl;
+import org.apache.nutch.service.impl.NutchServerPoolExecutor;
+import org.apache.nutch.service.model.response.JobInfo;
+import org.apache.nutch.service.model.response.JobInfo.State;
+import org.apache.nutch.service.resources.AdminResource;
+import org.apache.nutch.service.resources.ConfigResource;
+import org.apache.nutch.service.resources.DbResource;
+import org.apache.nutch.service.resources.JobResource;
+import org.apache.nutch.service.resources.ReaderResouce;
+import org.apache.nutch.service.resources.SeedResource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Queues;
+
+public class NutchServer {
+
+  private static final Logger LOG = LoggerFactory.getLogger(NutchServer.class);
+
+  private static final String LOCALHOST = "localhost";
+  private static final Integer DEFAULT_PORT = 8081;
+  private static final int JOB_CAPACITY = 100;
+
+  private static Integer port = DEFAULT_PORT;
+  private static String host  = LOCALHOST;
+
+  private static final String CMD_HELP = "help";
+  private static final String CMD_PORT = "port";
+  private static final String CMD_HOST = "host";
+
+  private long started;
+  private boolean running;
+  private ConfManager configManager;
+  private JobManager jobManager;
+  private JAXRSServerFactoryBean sf; 
+
+  private static FetchNodeDb fetchNodeDb;
+
+  private static NutchServer server;
+
+  static {
+    server = new NutchServer();
+  }
+
+  private NutchServer() {
+    configManager = new ConfManagerImpl();
+    BlockingQueue<Runnable> runnables = Queues.newArrayBlockingQueue(JOB_CAPACITY);
+    NutchServerPoolExecutor executor = new NutchServerPoolExecutor(10, JOB_CAPACITY, 1, TimeUnit.HOURS, runnables);
+    jobManager = new JobManagerImpl(new JobFactory(), configManager, executor);
+    fetchNodeDb = FetchNodeDb.getInstance();
+
+    sf = new JAXRSServerFactoryBean();
+    BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
+    JAXRSBindingFactory factory = new JAXRSBindingFactory();
+    factory.setBus(sf.getBus());
+    manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
+    sf.setResourceClasses(getClasses());
+    sf.setResourceProviders(getResourceProviders());
+    sf.setProvider(new JacksonJaxbJsonProvider());
+
+  }
+
+  public static NutchServer getInstance() {
+    return server;
+  }
+
+  protected static void startServer() {
+    server.start();
+  }
+
+  private void start() {
+    LOG.info("Starting NutchServer on {}:{}  ...", host, port);
+    try{
+      String address = "http://" + host + ":" + port;
+      sf.setAddress(address);
+      sf.create();
+    }catch(Exception e){
+      throw new IllegalStateException("Server could not be started", e);
+    }
+
+    started = System.currentTimeMillis();
+    running = true;
+    LOG.info("Started Nutch Server on {}:{} at {}", new Object[] {host, port, started});
+  }
+
+  private List<Class<?>> getClasses() {
+    List<Class<?>> resources = new ArrayList<Class<?>>();
+    resources.add(JobResource.class);
+    resources.add(ConfigResource.class);
+    resources.add(DbResource.class);
+    resources.add(AdminResource.class);
+    resources.add(SeedResource.class);
+    resources.add(ReaderResouce.class);
+    return resources;
+  }
+
+  private List<ResourceProvider> getResourceProviders() {
+    List<ResourceProvider> resourceProviders = new ArrayList<ResourceProvider>();
+    resourceProviders.add(new SingletonResourceProvider(getConfManager()));
+    return resourceProviders;
+  }
+
+  public ConfManager getConfManager() {
+    return configManager;
+  }
+
+  public JobManager getJobManager() {
+    return jobManager;
+  }
+
+  public FetchNodeDb getFetchNodeDb(){
+    return fetchNodeDb;
+  }
+
+  public boolean isRunning(){
+    return running;
+  }
+
+  public long getStarted(){
+    return started;
+  }
+
+  public static void main(String[] args) throws ParseException {
+    CommandLineParser parser = new PosixParser();
+    Options options = createOptions();
+    CommandLine commandLine = parser.parse(options, args);
+    if (commandLine.hasOption(CMD_HELP)) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("NutchServer", options, true);
+      return;
+    }
+
+    if (commandLine.hasOption(CMD_PORT)) {
+      port = Integer.parseInt(commandLine.getOptionValue(CMD_PORT));
+    }
+
+    if (commandLine.hasOption(CMD_HOST)) {
+      host = commandLine.getOptionValue(CMD_HOST);
+    }
+
+    startServer();
+  }
+
+  private static Options createOptions() {
+    Options options = new Options();
+
+    OptionBuilder.withDescription("Show this help");
+    options.addOption(OptionBuilder.create(CMD_HELP));
+
+    OptionBuilder.withArgName("port");
+    OptionBuilder.hasOptionalArg();
+    OptionBuilder.withDescription("The port to run the Nutch Server. Default port 8081");
+    options.addOption(OptionBuilder.create(CMD_PORT));
+
+    OptionBuilder.withArgName("host");
+    OptionBuilder.hasOptionalArg();
+    OptionBuilder.withDescription("The host to bind the Nutch Server to. Default is localhost.");
+    options.addOption(OptionBuilder.create(CMD_HOST));
+
+    return options;
+  }
+
+  public boolean canStop(boolean force){
+    if(force)
+      return true;
+
+    Collection<JobInfo> jobs = getJobManager().list(null, State.RUNNING);
+    return jobs.isEmpty();
+  }
+
+  protected static void setPort(int port) {
+	  NutchServer.port = port;
+  }
+  
+  public int getPort() {
+    return port;
+  }
+
+  public void stop() {
+    System.exit(0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/impl/ConfManagerImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/impl/ConfManagerImpl.java b/nutch-core/src/main/java/org/apache/nutch/service/impl/ConfManagerImpl.java
new file mode 100644
index 0000000..0c08ce4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/impl/ConfManagerImpl.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.impl;
+
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.collections.MapUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.service.ConfManager;
+import org.apache.nutch.service.model.request.NutchConfig;
+import org.apache.nutch.service.resources.ConfigResource;
+import org.apache.nutch.util.NutchConfiguration;
+
+import com.google.common.collect.Maps;
+
+public class ConfManagerImpl implements ConfManager {
+
+
+  private Map<String, Configuration> configurations = Maps.newConcurrentMap();
+
+  private AtomicInteger newConfigId = new AtomicInteger();
+
+  public ConfManagerImpl() {
+    configurations.put(ConfigResource.DEFAULT, NutchConfiguration.create());
+  }
+
+  /**
+   * Returns the configuration associated with the given confId
+   */
+  public Configuration get(String confId) {
+    if (confId == null) {
+      return configurations.get(ConfigResource.DEFAULT);
+    }
+    return configurations.get(confId);
+  }
+
+  public Map<String, String> getAsMap(String confId) {
+    Configuration configuration = configurations.get(confId);
+    if (configuration == null) {
+      return Collections.emptyMap();
+    }
+
+    Iterator<Entry<String, String>> iterator = configuration.iterator();
+    Map<String, String> configMap = Maps.newTreeMap();
+    while (iterator.hasNext()) {
+      Entry<String, String> entry = iterator.next();
+      configMap.put(entry.getKey(), entry.getValue());
+    }
+    return configMap;
+  }
+
+  /**
+   * Sets the given property in the configuration associated with the confId
+   */
+  public void setProperty(String confId, String propName, String propValue) {
+    if (!configurations.containsKey(confId)) {
+      throw new IllegalArgumentException("Unknown configId '" + confId + "'");
+    }
+    Configuration conf = configurations.get(confId);
+    conf.set(propName, propValue);
+  }
+
+  public Set<String> list() {
+    return configurations.keySet();
+  }
+
+  /**
+   * Creates a new configuration based on the values provided.
+   * @param nutchConfig the configuration values to apply
+   * @return String - confId
+   */
+  public String create(NutchConfig nutchConfig) {
+    if (StringUtils.isBlank(nutchConfig.getConfigId())) {
+      nutchConfig.setConfigId(String.valueOf(newConfigId.incrementAndGet()));
+    }
+
+    if (!canCreate(nutchConfig)) {
+      throw new IllegalArgumentException("Config already exists.");
+    }
+
+    createHadoopConfig(nutchConfig);
+    return nutchConfig.getConfigId();
+  }
+
+
+  public void delete(String confId) {
+    configurations.remove(confId);
+  }
+
+  private boolean canCreate(NutchConfig nutchConfig) {
+    if (nutchConfig.isForce()) {
+      return true;
+    }
+    if (!configurations.containsKey(nutchConfig.getConfigId())) {
+      return true;
+    }
+    return false;
+  }
+
+  private void createHadoopConfig(NutchConfig nutchConfig) {
+    Configuration conf = NutchConfiguration.create();
+    configurations.put(nutchConfig.getConfigId(), conf);
+
+    if (MapUtils.isEmpty(nutchConfig.getParams())) {
+      return;
+    }
+    for (Entry<String, String> e : nutchConfig.getParams().entrySet()) {
+      conf.set(e.getKey(), e.getValue());
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/impl/JobFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/impl/JobFactory.java b/nutch-core/src/main/java/org/apache/nutch/service/impl/JobFactory.java
new file mode 100644
index 0000000..a74e362
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/impl/JobFactory.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service.impl;
+
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.nutch.service.JobManager.JobType;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.DeduplicationJob;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.crawl.LinkDb;
+import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.indexer.IndexingJob;
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.util.NutchTool;
+
+import com.google.common.collect.Maps;
+
+public class JobFactory {
+  private static Map<JobType, Class<? extends NutchTool>> typeToClass;
+
+  static {
+    typeToClass = Maps.newHashMap();
+    typeToClass.put(JobType.INJECT, Injector.class);
+    typeToClass.put(JobType.GENERATE, Generator.class);
+    typeToClass.put(JobType.FETCH, Fetcher.class);
+    typeToClass.put(JobType.PARSE, ParseSegment.class);
+    typeToClass.put(JobType.INDEX, IndexingJob.class);
+    typeToClass.put(JobType.UPDATEDB, CrawlDb.class);
+    typeToClass.put(JobType.INVERTLINKS, LinkDb.class);
+    typeToClass.put(JobType.DEDUP, DeduplicationJob.class);
+  }
+
+  public NutchTool createToolByType(JobType type, Configuration conf) {
+    if (!typeToClass.containsKey(type)) {
+      return null;
+    }
+    Class<? extends NutchTool> clz = typeToClass.get(type);
+    return createTool(clz, conf);
+  }
+
+  @SuppressWarnings({ "rawtypes", "unchecked" })
+  public NutchTool createToolByClassName(String className, Configuration conf) {
+    try {
+      Class clz = Class.forName(className);
+      return createTool(clz, conf);
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  private NutchTool createTool(Class<? extends NutchTool> clz,
+      Configuration conf) {
+    return ReflectionUtils.newInstance(clz, conf);
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/impl/JobManagerImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/impl/JobManagerImpl.java b/nutch-core/src/main/java/org/apache/nutch/service/impl/JobManagerImpl.java
new file mode 100644
index 0000000..a915457
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/impl/JobManagerImpl.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.impl;
+
+import java.util.Collection;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.service.ConfManager;
+import org.apache.nutch.service.JobManager;
+import org.apache.nutch.service.model.request.JobConfig;
+import org.apache.nutch.service.model.response.JobInfo;
+import org.apache.nutch.service.model.response.JobInfo.State;
+import org.apache.nutch.util.NutchTool;
+
+public class JobManagerImpl implements JobManager {
+
+  private JobFactory jobFactory;
+  private NutchServerPoolExecutor executor;
+  private ConfManager configManager;
+
+  public JobManagerImpl(JobFactory jobFactory, ConfManager configManager, NutchServerPoolExecutor executor) {
+    this.jobFactory = jobFactory;
+    this.configManager = configManager;		
+    this.executor = executor;
+  }
+
+  @Override
+  public JobInfo create(JobConfig jobConfig) {
+    if (jobConfig.getArgs() == null) {
+      throw new IllegalArgumentException("Arguments cannot be null!");
+    }
+    Configuration conf = cloneConfiguration(jobConfig.getConfId());
+    NutchTool tool = createTool(jobConfig, conf);
+    JobWorker worker = new JobWorker(jobConfig, conf, tool);
+    executor.execute(worker);
+    executor.purge();		
+    return worker.getInfo();
+  }
+
+  private Configuration cloneConfiguration(String confId) {
+    Configuration conf = configManager.get(confId);
+    if (conf == null) {
+      throw new IllegalArgumentException("Unknown confId " + confId);
+    }
+    return new Configuration(conf);
+  }
+
+  @Override
+  public Collection<JobInfo> list(String crawlId, State state) {
+    if (state == null || state == State.ANY) {
+      return executor.getAllJobs();
+    }
+    if (state == State.RUNNING || state == State.IDLE) {
+      return executor.getJobRunning();
+    }
+    return executor.getJobHistory();
+  }
+
+  @Override
+  public JobInfo get(String crawlId, String jobId) {
+    return executor.getInfo(jobId);
+  }
+
+  @Override
+  public boolean abort(String crawlId, String id) {
+    return executor.findWorker(id).killJob();
+  }
+
+  @Override
+  public boolean stop(String crawlId, String id) {
+    return executor.findWorker(id).stopJob();
+  }
+
+  private NutchTool createTool(JobConfig jobConfig, Configuration conf){
+    if(StringUtils.isNotBlank(jobConfig.getJobClassName())){
+      return jobFactory.createToolByClassName(jobConfig.getJobClassName(), conf);
+    }
+    return jobFactory.createToolByType(jobConfig.getType(), conf);
+  }
+}


[42/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
new file mode 100644
index 0000000..daf96e0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginsReader.java
@@ -0,0 +1,278 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * A reader to load the information stored in the
+ * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
+ * 
+ * @author mattmann
+ * @version 1.0
+ */
+class ParsePluginsReader {
+
+  /* our log stream */
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ParsePluginsReader.class);
+
+  /** The property name of the parse-plugins location */
+  private static final String PP_FILE_PROP = "parse.plugin.file";
+
+  /** the parse-plugins file */
+  private String fParsePluginsFile = null;
+
+  /**
+   * Constructs a new ParsePluginsReader
+   */
+  public ParsePluginsReader() {
+  }
+
+  /**
+   * Reads the <code>parse-plugins.xml</code> file and returns the
+   * {@link ParsePluginList} defined by it.
+   * 
+   * @return A {@link ParsePluginList} specified by the
+   *         <code>parse-plugins.xml</code> file.
+   * @throws Exception
+   *           If any parsing error occurs.
+   */
+  public ParsePluginList parse(Configuration conf) {
+
+    ParsePluginList pList = new ParsePluginList();
+
+    // open up the XML file
+    DocumentBuilderFactory factory = null;
+    DocumentBuilder parser = null;
+    Document document = null;
+    InputSource inputSource = null;
+
+    InputStream ppInputStream = null;
+    if (fParsePluginsFile != null) {
+      URL parsePluginUrl = null;
+      try {
+        parsePluginUrl = new URL(fParsePluginsFile);
+        ppInputStream = parsePluginUrl.openStream();
+      } catch (Exception e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Unable to load parse plugins file from URL " + "["
+              + fParsePluginsFile + "]. Reason is [" + e + "]");
+        }
+        return pList;
+      }
+    } else {
+      ppInputStream = conf.getConfResourceAsInputStream(conf.get(PP_FILE_PROP));
+    }
+
+    inputSource = new InputSource(ppInputStream);
+
+    try {
+      factory = DocumentBuilderFactory.newInstance();
+      parser = factory.newDocumentBuilder();
+      document = parser.parse(inputSource);
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Unable to parse [" + fParsePluginsFile + "]." + "Reason is ["
+            + e + "]");
+      }
+      return null;
+    }
+
+    Element parsePlugins = document.getDocumentElement();
+
+    // build up the alias hash map
+    Map<String, String> aliases = getAliases(parsePlugins);
+    // And store it on the parse plugin list
+    pList.setAliases(aliases);
+
+    // get all the mime type nodes
+    NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
+
+    // iterate through the mime types
+    for (int i = 0; i < mimeTypes.getLength(); i++) {
+      Element mimeType = (Element) mimeTypes.item(i);
+      String mimeTypeStr = mimeType.getAttribute("name");
+
+      // for each mimeType, get the plugin list
+      NodeList pluginList = mimeType.getElementsByTagName("plugin");
+
+      // iterate through the plugins, add them in order read
+      // OR if they have a special order="" attribute, then hold those in
+      // a separate list, and then insert them into the final list at the
+      // order specified
+      if (pluginList != null && pluginList.getLength() > 0) {
+        List<String> plugList = new ArrayList<String>(pluginList.getLength());
+
+        for (int j = 0; j < pluginList.getLength(); j++) {
+          Element plugin = (Element) pluginList.item(j);
+          String pluginId = plugin.getAttribute("id");
+          String extId = aliases.get(pluginId);
+          if (extId == null) {
+            // Assume an extension id is directly specified
+            extId = pluginId;
+          }
+          String orderStr = plugin.getAttribute("order");
+          int order = -1;
+          try {
+            order = Integer.parseInt(orderStr);
+          } catch (NumberFormatException ignore) {
+          }
+          if (order != -1) {
+            plugList.add(order - 1, extId);
+          } else {
+            plugList.add(extId);
+          }
+        }
+
+        // now add the plugin list and map it to this mimeType
+        pList.setPluginList(mimeTypeStr, plugList);
+
+      } else if (LOG.isWarnEnabled()) {
+        LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: "
+            + mimeTypeStr + ", continuing parse");
+      }
+    }
+    return pList;
+  }
+
+  /**
+   * Tests parsing of the parse-plugins.xml file. An alternative name for the
+   * file can be specified via the <code>--file</code> option, although the file
+   * must be located in the <code>$NUTCH_HOME/conf</code> directory.
+   * 
+   * @param args
+   *          Currently only the --file argument to specify an alternative name
+   *          for the parse-plugins.xml file is supported.
+   */
+  public static void main(String[] args) throws Exception {
+    String parsePluginFile = null;
+    String usage = "ParsePluginsReader [--file <parse plugin file location>]";
+
+    if ((args.length != 0 && args.length != 2)
+        || (args.length == 2 && !"--file".equals(args[0]))) {
+      System.err.println(usage);
+      System.exit(1);
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("--file")) {
+        parsePluginFile = args[++i];
+      }
+    }
+
+    ParsePluginsReader reader = new ParsePluginsReader();
+
+    if (parsePluginFile != null) {
+      reader.setFParsePluginsFile(parsePluginFile);
+    }
+
+    ParsePluginList prefs = reader.parse(NutchConfiguration.create());
+
+    for (String mimeType : prefs.getSupportedMimeTypes()) {
+
+      System.out.println("MIMETYPE: " + mimeType);
+      List<String> plugList = prefs.getPluginList(mimeType);
+
+      System.out.println("EXTENSION IDs:");
+
+      for (String j : plugList) {
+        System.out.println(j);
+      }
+    }
+
+  }
+
+  /**
+   * @return Returns the fParsePluginsFile.
+   */
+  public String getFParsePluginsFile() {
+    return fParsePluginsFile;
+  }
+
+  /**
+   * @param parsePluginsFile
+   *          The fParsePluginsFile to set.
+   */
+  public void setFParsePluginsFile(String parsePluginsFile) {
+    fParsePluginsFile = parsePluginsFile;
+  }
+
+  private Map<String, String> getAliases(Element parsePluginsRoot) {
+
+    Map<String, String> aliases = new HashMap<String, String>();
+    NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");
+
+    if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("No aliases defined in parse-plugins.xml!");
+      }
+      return aliases;
+    }
+
+    if (aliasRoot.getLength() > 1) {
+      // log a warning, but try and continue processing
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("There should only be one \"aliases\" tag in parse-plugins.xml");
+      }
+    }
+
+    Element aliasRootElem = (Element) aliasRoot.item(0);
+    NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");
+
+    if (aliasElements != null && aliasElements.getLength() > 0) {
+      for (int i = 0; i < aliasElements.getLength(); i++) {
+        Element aliasElem = (Element) aliasElements.item(i);
+        String parsePluginId = aliasElem.getAttribute("name");
+        String extensionId = aliasElem.getAttribute("extension-id");
+        if (LOG.isTraceEnabled()) {
+          LOG.trace("Found alias: plugin-id: " + parsePluginId
+              + ", extension-id: " + extensionId);
+        }
+        if (parsePluginId != null && extensionId != null) {
+          aliases.put(parsePluginId, extensionId);
+        }
+      }
+    }
+    return aliases;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
new file mode 100644
index 0000000..92d8871
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseResult.java
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.Text;
+
+/**
+ * A utility class that stores result of a parse. Internally a ParseResult
+ * stores &lt;{@link Text}, {@link Parse}&gt; pairs.
+ * <p>
+ * Parsers may return multiple results, which correspond to parts or other
+ * associated documents related to the original URL.
+ * </p>
+ * <p>
+ * There will be usually one parse result that corresponds directly to the
+ * original URL, and possibly many (or none) results that correspond to derived
+ * URLs (or sub-URLs).
+ */
+public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {
+  /** Map of URL (or sub-URL) to its parse output. */
+  private Map<Text, Parse> parseMap;
+  /** The URL from which all parse outputs in this result were obtained. */
+  private String originalUrl;
+
+  public static final Logger LOG = LoggerFactory.getLogger(ParseResult.class);
+
+  /**
+   * Create a container for parse results.
+   * 
+   * @param originalUrl
+   *          the original url from which all parse results have been obtained.
+   */
+  public ParseResult(String originalUrl) {
+    parseMap = new HashMap<Text, Parse>();
+    this.originalUrl = originalUrl;
+  }
+
+  /**
+   * Convenience method for obtaining {@link ParseResult} from a single
+   * <code>Parse</code> output.
+   * 
+   * @param url
+   *          canonical url.
+   * @param parse
+   *          single parse output.
+   * @return result containing the single parse output.
+   */
+  public static ParseResult createParseResult(String url, Parse parse) {
+    ParseResult parseResult = new ParseResult(url);
+    parseResult.put(new Text(url), new ParseText(parse.getText()),
+        parse.getData());
+    return parseResult;
+  }
+
+  /**
+   * Checks whether the result is empty.
+   * 
+   * @return true if no parse outputs are stored, false otherwise
+   */
+  public boolean isEmpty() {
+    return parseMap.isEmpty();
+  }
+
+  /**
+   * Return the number of parse outputs (both successful and failed)
+   */
+  public int size() {
+    return parseMap.size();
+  }
+
+  /**
+   * Retrieve a single parse output.
+   * 
+   * @param key
+   *          sub-url under which the parse output is stored.
+   * @return parse output corresponding to this sub-url, or null.
+   */
+  public Parse get(String key) {
+    return get(new Text(key));
+  }
+
+  /**
+   * Retrieve a single parse output.
+   * 
+   * @param key
+   *          sub-url under which the parse output is stored.
+   * @return parse output corresponding to this sub-url, or null.
+   */
+  public Parse get(Text key) {
+    return parseMap.get(key);
+  }
+
+  /**
+   * Store a result of parsing.
+   * 
+   * @param key
+   *          URL or sub-url of this parse result
+   * @param text
+   *          plain text result
+   * @param data
+   *          corresponding parse metadata of this result
+   */
+  public void put(Text key, ParseText text, ParseData data) {
+    // Delegate through the String overload, which stores a fresh Text copy;
+    // this protects the map against callers that reuse/mutate Text instances
+    // (a common Hadoop pattern).
+    put(key.toString(), text, data);
+  }
+
+  /**
+   * Store a result of parsing.
+   * 
+   * @param key
+   *          URL or sub-url of this parse result
+   * @param text
+   *          plain text result
+   * @param data
+   *          corresponding parse metadata of this result
+   */
+  public void put(String key, ParseText text, ParseData data) {
+    // The parse is flagged canonical iff it corresponds to the original URL.
+    parseMap.put(new Text(key),
+        new ParseImpl(text, data, key.equals(originalUrl)));
+  }
+
+  /**
+   * Iterate over all entries in the &lt;url, Parse&gt; map.
+   */
+  public Iterator<Entry<Text, Parse>> iterator() {
+    return parseMap.entrySet().iterator();
+  }
+
+  /**
+   * Remove all results where status is not successful (as determined by
+   * {@link ParseStatus#isSuccess()}). Note that effects of this operation
+   * cannot be reversed.
+   */
+  public void filter() {
+    for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
+      Entry<Text, Parse> entry = i.next();
+      if (!entry.getValue().getData().getStatus().isSuccess()) {
+        LOG.warn("{} is not parsed successfully, filtering", entry.getKey());
+        i.remove();
+      }
+    }
+
+  }
+
+  /**
+   * A convenience method which returns true only if all parses are successful.
+   * Parse success is determined by {@link ParseStatus#isSuccess()}.
+   */
+  public boolean isSuccess() {
+    for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
+      Entry<Text, Parse> entry = i.next();
+      if (!entry.getValue().getData().getStatus().isSuccess()) {
+        return false;
+      }
+    }
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
new file mode 100644
index 0000000..b008bed
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseSegment.java
@@ -0,0 +1,309 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.segment.SegmentChecker;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.*;
+import org.apache.hadoop.fs.Path;
+
+import java.io.*;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.Map.Entry;
+
+/* Parse content in a segment. */
+/**
+ * Parse content in a segment: runs a map-reduce job that reads fetched
+ * {@link Content}, applies the configured parsers, scores and signs the
+ * results, and writes them back into the segment.
+ */
+public class ParseSegment extends NutchTool implements Tool,
+    Mapper<WritableComparable<?>, Content, Text, ParseImpl>,
+    Reducer<Text, Writable, Text, Writable> {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);
+
+  /** Config key: skip pages whose content was truncated during fetching. */
+  public static final String SKIP_TRUNCATED = "parser.skip.truncated";
+
+  private ScoringFilters scfilters;
+
+  private ParseUtil parseUtil;
+
+  private boolean skipTruncated;
+
+  public ParseSegment() {
+    this(null);
+  }
+
+  public ParseSegment(Configuration conf) {
+    super(conf);
+  }
+
+  public void configure(JobConf job) {
+    setConf(job);
+    this.scfilters = new ScoringFilters(job);
+    skipTruncated = job.getBoolean(SKIP_TRUNCATED, true);
+  }
+
+  public void close() {
+  }
+
+  private Text newKey = new Text();
+
+  /**
+   * Parses a single fetched document and emits one {@link ParseImpl} per
+   * parse output (parsers may produce several outputs per URL).
+   */
+  public void map(WritableComparable<?> key, Content content,
+      OutputCollector<Text, ParseImpl> output, Reporter reporter)
+      throws IOException {
+    // convert on the fly from old UTF8 keys
+    if (key instanceof Text) {
+      newKey.set(key.toString());
+      key = newKey;
+    }
+
+    int status = Integer.parseInt(content.getMetadata().get(
+        Nutch.FETCH_STATUS_KEY));
+    if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
+      // content not fetched successfully, skip document
+      LOG.debug("Skipping " + key + " as content is not fetched successfully");
+      return;
+    }
+
+    if (skipTruncated && isTruncated(content)) {
+      return;
+    }
+
+    long start = System.currentTimeMillis();
+    ParseResult parseResult = null;
+    try {
+      // lazily created: configure() does not build it, and map() may run
+      // with a conf that was set after construction
+      if (parseUtil == null)
+        parseUtil = new ParseUtil(getConf());
+      parseResult = parseUtil.parse(content);
+    } catch (Exception e) {
+      LOG.warn("Error parsing: " + key + ": "
+          + StringUtils.stringifyException(e));
+      return;
+    }
+
+    for (Entry<Text, Parse> entry : parseResult) {
+      Text url = entry.getKey();
+      Parse parse = entry.getValue();
+      ParseStatus parseStatus = parse.getData().getStatus();
+
+      reporter.incrCounter("ParserStatus",
+          ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
+
+      if (!parseStatus.isSuccess()) {
+        LOG.warn("Error parsing: " + key + ": " + parseStatus);
+        parse = parseStatus.getEmptyParse(getConf());
+      }
+
+      // pass segment name to parse data
+      parse.getData().getContentMeta()
+          .set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY));
+
+      // compute the new signature
+      byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+          content, parse);
+      parse.getData().getContentMeta()
+          .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
+
+      try {
+        scfilters.passScoreAfterParsing(url, content, parse);
+      } catch (ScoringFilterException e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Error passing score: " + url + ": " + e.getMessage());
+        }
+      }
+
+      long end = System.currentTimeMillis();
+      LOG.info("Parsed (" + Long.toString(end - start) + "ms):" + url);
+
+      output.collect(
+          url,
+          new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse
+              .isCanonical()));
+    }
+  }
+
+  /**
+   * Checks if the page's content is truncated.
+   * 
+   * @param content
+   * @return If the page is truncated <code>true</code>. When it is not, or when
+   *         it could be determined, <code>false</code>.
+   */
+  public static boolean isTruncated(Content content) {
+    byte[] contentBytes = content.getContent();
+    if (contentBytes == null)
+      return false;
+    Metadata metadata = content.getMetadata();
+    if (metadata == null)
+      return false;
+
+    String lengthStr = metadata.get(Response.CONTENT_LENGTH);
+    if (lengthStr != null)
+      lengthStr = lengthStr.trim();
+    if (StringUtil.isEmpty(lengthStr)) {
+      // no Content-Length header: truncation cannot be determined
+      return false;
+    }
+    int inHeaderSize;
+    String url = content.getUrl();
+    try {
+      inHeaderSize = Integer.parseInt(lengthStr);
+    } catch (NumberFormatException e) {
+      LOG.warn("Wrong contentlength format for " + url, e);
+      return false;
+    }
+    int actualSize = contentBytes.length;
+    if (inHeaderSize > actualSize) {
+      LOG.info(url + " skipped. Content of size " + inHeaderSize
+          + " was truncated to " + actualSize);
+      return true;
+    }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize="
+          + inHeaderSize);
+    }
+    return false;
+  }
+
+  public void reduce(Text key, Iterator<Writable> values,
+      OutputCollector<Text, Writable> output, Reporter reporter)
+      throws IOException {
+    output.collect(key, values.next()); // collect first value
+  }
+
+  /**
+   * Runs the parse job over the given segment, unless the segment was
+   * already parsed.
+   *
+   * @param segment
+   *          path to the segment directory
+   */
+  public void parse(Path segment) throws IOException {
+    // NUTCH-1854: never parse the same segment twice
+    if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
+      LOG.warn("Segment: " + segment
+          + " already parsed!! Skipped parsing this segment!!");
+      return;
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    if (LOG.isInfoEnabled()) {
+      LOG.info("ParseSegment: starting at " + sdf.format(start));
+      LOG.info("ParseSegment: segment: " + segment);
+    }
+
+    JobConf job = new NutchJob(getConf());
+    job.setJobName("parse " + segment);
+
+    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setMapperClass(ParseSegment.class);
+    job.setReducerClass(ParseSegment.class);
+
+    FileOutputFormat.setOutputPath(job, segment);
+    job.setOutputFormat(ParseOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(ParseImpl.class);
+
+    JobClient.runJob(job);
+    long end = System.currentTimeMillis();
+    LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(),
+        args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
+    Path segment;
+
+    String usage = "Usage: ParseSegment segment [-noFilter] [-noNormalize]";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    if (args.length > 1) {
+      for (int i = 1; i < args.length; i++) {
+        String param = args[i];
+
+        if ("-nofilter".equalsIgnoreCase(param)) {
+          getConf().setBoolean("parse.filter.urls", false);
+        } else if ("-nonormalize".equalsIgnoreCase(param)) {
+          getConf().setBoolean("parse.normalize.urls", false);
+        }
+      }
+    }
+
+    segment = new Path(args[0]);
+    parse(segment);
+    return 0;
+  }
+
+  /*
+   * Used for Nutch REST service
+   */
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+    Path segment;
+    if (args.containsKey(Nutch.ARG_SEGMENT)) {
+      Object seg = args.get(Nutch.ARG_SEGMENT);
+      if (seg instanceof Path) {
+        segment = (Path) seg;
+      } else {
+        segment = new Path(seg.toString());
+      }
+    } else {
+      // no segment given: pick the most recently modified segment
+      String segment_dir = crawlId + "/segments";
+      File segmentsDir = new File(segment_dir);
+      File[] segmentsList = segmentsDir.listFiles();
+      // listFiles() returns null for a missing/unreadable directory;
+      // fail with a clear message instead of an obscure NPE in sort()
+      if (segmentsList == null || segmentsList.length == 0) {
+        throw new IllegalStateException("No segments found in " + segment_dir);
+      }
+      Arrays.sort(segmentsList, new Comparator<File>() {
+        @Override
+        public int compare(File f1, File f2) {
+          // Newest first. Long.compare keeps the comparator antisymmetric;
+          // the previous version returned only -1 or 0, which violates the
+          // Comparator contract and yields undefined sort results.
+          return Long.compare(f2.lastModified(), f1.lastModified());
+        }
+      });
+      segment = new Path(segmentsList[0].getPath());
+    }
+
+    if (args.containsKey("nofilter")) {
+      getConf().setBoolean("parse.filter.urls", false);
+    }
+    if (args.containsKey("nonormalize")) {
+      getConf().setBoolean("parse.normalize.urls", false);
+    }
+    parse(segment);
+    results.put(Nutch.VAL_RESULT, Integer.toString(0));
+    return results;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
new file mode 100644
index 0000000..b9d5959
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseStatus.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Created on Apr 28, 2005
+ * Author: Andrzej Bialecki &lt;ab@getopt.org&gt;
+ *
+ */
+package org.apache.nutch.parse;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class ParseStatus implements Writable {
+
+  private final static byte VERSION = 2;
+
+  // Primary status codes:
+
+  /** Parsing was not performed. */
+  public static final byte NOTPARSED = 0;
+  /** Parsing succeeded. */
+  public static final byte SUCCESS = 1;
+  /** General failure. There may be a more specific error message in arguments. */
+  public static final byte FAILED = 2;
+
+  public static final String[] majorCodes = { "notparsed", "success", "failed" };
+
+  // Secondary success codes go here:
+
+  /**
+   * Parsed content contains a directive to redirect to another URL. The target
+   * URL can be retrieved from the arguments.
+   */
+  public static final short SUCCESS_REDIRECT = 100;
+
+  // Secondary failure codes go here:
+
+  /**
+   * Parsing failed. An Exception occurred (which may be retrieved from the
+   * arguments).
+   */
+  public static final short FAILED_EXCEPTION = 200;
+  /**
+   * Parsing failed. Content was truncated, but the parser cannot handle
+   * incomplete content.
+   */
+  public static final short FAILED_TRUNCATED = 202;
+  /**
+   * Parsing failed. Invalid format - the content may be corrupted or of wrong
+   * type.
+   */
+  public static final short FAILED_INVALID_FORMAT = 203;
+  /**
+   * Parsing failed. Other related parts of the content are needed to complete
+   * parsing. The list of URLs to missing parts may be provided in arguments.
+   * The Fetcher may decide to fetch these parts at once, then put them into
+   * Content.metadata, and supply them for re-parsing.
+   */
+  public static final short FAILED_MISSING_PARTS = 204;
+  /**
+   * Parsing failed. There was no content to be parsed - probably caused by
+   * errors at protocol stage.
+   */
+  public static final short FAILED_MISSING_CONTENT = 205;
+
+  public static final ParseStatus STATUS_NOTPARSED = new ParseStatus(NOTPARSED);
+  public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
+  public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
+
+  private byte majorCode = 0;
+  private short minorCode = 0;
+  private String[] args = null;
+
+  public byte getVersion() {
+    return VERSION;
+  }
+
+  public ParseStatus() {
+
+  }
+
+  public ParseStatus(int majorCode, int minorCode, String[] args) {
+    this.args = args;
+    this.majorCode = (byte) majorCode;
+    this.minorCode = (short) minorCode;
+  }
+
+  public ParseStatus(int majorCode) {
+    this(majorCode, 0, (String[]) null);
+  }
+
+  public ParseStatus(int majorCode, String[] args) {
+    this(majorCode, 0, args);
+  }
+
+  public ParseStatus(int majorCode, int minorCode) {
+    this(majorCode, minorCode, (String[]) null);
+  }
+
+  /** Simplified constructor for passing just a text message. */
+  public ParseStatus(int majorCode, int minorCode, String message) {
+    this(majorCode, minorCode, new String[] { message });
+  }
+
+  /** Simplified constructor for passing just a text message. */
+  public ParseStatus(int majorCode, String message) {
+    this(majorCode, 0, new String[] { message });
+  }
+
+  public ParseStatus(Throwable t) {
+    this(FAILED, FAILED_EXCEPTION, new String[] { t.toString() });
+  }
+
+  public static ParseStatus read(DataInput in) throws IOException {
+    ParseStatus res = new ParseStatus();
+    res.readFields(in);
+    return res;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    byte version = in.readByte();
+    switch (version) {
+    case 1:
+      majorCode = in.readByte();
+      minorCode = in.readShort();
+      args = WritableUtils.readCompressedStringArray(in);
+      break;
+    case 2:
+      majorCode = in.readByte();
+      minorCode = in.readShort();
+      args = WritableUtils.readStringArray(in);
+      break;
+    default:
+      throw new VersionMismatchException(VERSION, version);
+    }
+  }
+
+  public void write(DataOutput out) throws IOException {
+    out.writeByte(VERSION);
+    out.writeByte(majorCode);
+    out.writeShort(minorCode);
+    if (args == null) {
+      // readFields (v2) decodes args via WritableUtils.readStringArray,
+      // which reads a vInt length and treats -1 as null. The previous
+      // out.writeInt(-1) wrote 4 raw bytes of which only the first was
+      // consumed by readVInt, leaving 3 stray bytes in the stream.
+      WritableUtils.writeVInt(out, -1);
+    } else {
+      WritableUtils.writeStringArray(out, args);
+    }
+  }
+
+  /**
+   * A convenience method. Returns true if majorCode is SUCCESS, false
+   * otherwise.
+   */
+
+  public boolean isSuccess() {
+    return majorCode == SUCCESS;
+  }
+
+  /**
+   * A convenience method. Return a String representation of the first argument,
+   * or null.
+   */
+  public String getMessage() {
+    if (args != null && args.length > 0 && args[0] != null)
+      return args[0];
+    return null;
+  }
+
+  public String[] getArgs() {
+    return args;
+  }
+
+  public int getMajorCode() {
+    return majorCode;
+  }
+
+  public int getMinorCode() {
+    return minorCode;
+  }
+
+  /**
+   * A convenience method. Creates an empty Parse instance, which returns this
+   * status.
+   */
+  public Parse getEmptyParse(Configuration conf) {
+    return new EmptyParseImpl(this, conf);
+  }
+
+  /**
+   * A convenience method. Creates an empty ParseResult, which contains this
+   * status.
+   */
+  public ParseResult getEmptyParseResult(String url, Configuration conf) {
+    return ParseResult.createParseResult(url, getEmptyParse(conf));
+  }
+
+  public String toString() {
+    // StringBuilder: no synchronization needed for a local buffer
+    StringBuilder res = new StringBuilder();
+    String name = null;
+    if (majorCode >= 0 && majorCode < majorCodes.length)
+      name = majorCodes[majorCode];
+    else
+      name = "UNKNOWN!";
+    res.append(name + "(" + majorCode + "," + minorCode + ")");
+    if (args != null) {
+      if (args.length == 1) {
+        res.append(": " + String.valueOf(args[0]));
+      } else {
+        for (int i = 0; i < args.length; i++) {
+          if (args[i] != null)
+            res.append(", args[" + i + "]=" + String.valueOf(args[i]));
+        }
+      }
+    }
+    return res.toString();
+  }
+
+  public void setArgs(String[] args) {
+    this.args = args;
+  }
+
+  public void setMessage(String msg) {
+    if (args == null || args.length == 0) {
+      args = new String[1];
+    }
+    args[0] = msg;
+  }
+
+  public void setMajorCode(byte majorCode) {
+    this.majorCode = majorCode;
+  }
+
+  public void setMinorCode(short minorCode) {
+    this.minorCode = minorCode;
+  }
+
+  public boolean equals(Object o) {
+    if (o == null)
+      return false;
+    if (!(o instanceof ParseStatus))
+      return false;
+    boolean res = true;
+    ParseStatus other = (ParseStatus) o;
+    res = res && (this.majorCode == other.majorCode)
+        && (this.minorCode == other.minorCode);
+    if (!res)
+      return res;
+    if (this.args == null) {
+      if (other.args == null)
+        return true;
+      else
+        return false;
+    } else {
+      if (other.args == null)
+        return false;
+      if (other.args.length != this.args.length)
+        return false;
+      for (int i = 0; i < this.args.length; i++) {
+        if (!this.args[i].equals(other.args[i]))
+          return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Consistent with {@link #equals(Object)}: combines major/minor codes and
+   * the argument strings. Added because equals() was overridden without
+   * hashCode(), which breaks hash-based collections.
+   */
+  public int hashCode() {
+    int result = 31 * majorCode + minorCode;
+    if (args != null) {
+      for (String arg : args) {
+        result = 31 * result + (arg == null ? 0 : arg.hashCode());
+      }
+    }
+    return result;
+  }
+
+  private static class EmptyParseImpl implements Parse {
+
+    private ParseData data = null;
+
+    public EmptyParseImpl(ParseStatus status, Configuration conf) {
+      data = new ParseData(status, "", new Outlink[0], new Metadata(),
+          new Metadata());
+    }
+
+    public ParseData getData() {
+      return data;
+    }
+
+    public String getText() {
+      return "";
+    }
+
+    public boolean isCanonical() {
+      return true;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
new file mode 100644
index 0000000..13416cf
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseText.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.commons.cli.Options;
+import org.apache.nutch.util.NutchConfiguration;
+
+/* The text conversion of page's content, stored using gzip compression.
+ * @see Parse#getText()
+ */
+public final class ParseText implements Writable {
+  public static final String DIR_NAME = "parse_text";
+
+  private final static byte VERSION = 2;
+
+  public ParseText() {
+  }
+
+  /** Plain-text conversion of the page; may be null when built via the
+   *  no-arg constructor before readFields() is called. */
+  private String text;
+
+  public ParseText(String text) {
+    this.text = text;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    byte version = in.readByte();
+    switch (version) {
+    case 1:
+      text = WritableUtils.readCompressedString(in);
+      break;
+    case VERSION:
+      text = Text.readString(in);
+      break;
+    default:
+      throw new VersionMismatchException(VERSION, version);
+    }
+  }
+
+  public final void write(DataOutput out) throws IOException {
+    // writeByte makes the single-byte intent explicit (write(int) emits
+    // the same byte, but is easy to misread as a 4-byte write)
+    out.writeByte(VERSION);
+    Text.writeString(out, text);
+  }
+
+  public final static ParseText read(DataInput in) throws IOException {
+    ParseText parseText = new ParseText();
+    parseText.readFields(in);
+    return parseText;
+  }
+
+  //
+  // Accessor methods
+  //
+  public String getText() {
+    return text;
+  }
+
+  public boolean equals(Object o) {
+    if (this == o)
+      return true;
+    if (!(o instanceof ParseText))
+      return false;
+    ParseText other = (ParseText) o;
+    // null-safe: a ParseText created with the no-arg constructor has
+    // text == null, which previously caused an NPE here
+    return text == null ? other.text == null : text.equals(other.text);
+  }
+
+  /** Consistent with equals(); added because equals() was overridden
+   *  without hashCode(), which breaks hash-based collections. */
+  public int hashCode() {
+    return text == null ? 0 : text.hashCode();
+  }
+
+  public String toString() {
+    return text;
+  }
+
+  public static void main(String argv[]) throws Exception {
+    String usage = "ParseText (-local | -dfs <namenode:port>) recno segment";
+
+    if (argv.length < 3) {
+      System.out.println("usage:" + usage);
+      return;
+    }
+    Options opts = new Options();
+    Configuration conf = NutchConfiguration.create();
+
+    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
+    String[] remainingArgs = parser.getRemainingArgs();
+
+    FileSystem fs = FileSystem.get(conf);
+    try {
+      int recno = Integer.parseInt(remainingArgs[0]);
+      String segment = remainingArgs[1];
+      String filename = new Path(segment, ParseText.DIR_NAME).toString();
+
+      ParseText parseText = new ParseText();
+      ArrayFile.Reader parseTexts = new ArrayFile.Reader(fs, filename, conf);
+
+      parseTexts.get(recno, parseText);
+      System.out.println("Retrieved " + recno + " from file " + filename);
+      System.out.println(parseText);
+      parseTexts.close();
+    } finally {
+      fs.close();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
new file mode 100644
index 0000000..39024dc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseUtil.java
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// Commons Logging imports
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.protocol.Content;
+
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
+
+/**
+ * A utility class containing methods to simplify common parsing tasks, such
+ * as iterating through a preferred list of {@link Parser}s to obtain
+ * {@link Parse} objects.
+ * 
+ * @author mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ * @author S&eacute;bastien Le Callonnec
+ */
+public class ParseUtil {
+
+  /* our log stream */
+  public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
+  private ParserFactory parserFactory;
+  /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
+  private int maxParseTime = 30;
+  private ExecutorService executorService;
+
+  /**
+   * 
+   * @param conf
+   */
+  public ParseUtil(Configuration conf) {
+    this.parserFactory = new ParserFactory(conf);
+    maxParseTime = conf.getInt("parser.timeout", 30);
+    executorService = Executors.newCachedThreadPool(new ThreadFactoryBuilder()
+        .setNameFormat("parse-%d").setDaemon(true).build());
+  }
+
+  /**
+   * Performs a parse by iterating through a List of preferred {@link Parser}s
+   * until a successful parse is performed and a {@link Parse} object is
+   * returned. If the parse is unsuccessful, a message is logged to the
+   * <code>WARNING</code> level, and an empty parse is returned.
+   * 
+   * @param content
+   *          The content to try and parse.
+   * @return &lt;key, {@link Parse}&gt; pairs.
+   * @throws ParseException
+   *           If no suitable parser is found to perform the parse.
+   */
+  public ParseResult parse(Content content) throws ParseException {
+    Parser[] parsers = null;
+
+    try {
+      parsers = this.parserFactory.getParsers(content.getContentType(),
+          content.getUrl() != null ? content.getUrl() : "");
+    } catch (ParserNotFound e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("No suitable parser found when trying to parse content "
+            + content.getUrl() + " of type " + content.getContentType());
+      }
+      throw new ParseException(e.getMessage());
+    }
+
+    ParseResult parseResult = null;
+    for (int i = 0; i < parsers.length; i++) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i]
+            + "]");
+      }
+      if (maxParseTime != -1)
+        parseResult = runParser(parsers[i], content);
+      else
+        parseResult = parsers[i].getParse(content);
+
+      if (parseResult != null && !parseResult.isEmpty())
+        return parseResult;
+    }
+
+    if (LOG.isWarnEnabled()) {
+      LOG.warn("Unable to successfully parse content " + content.getUrl()
+          + " of type " + content.getContentType());
+    }
+    return new ParseStatus(new ParseException(
+        "Unable to successfully parse content")).getEmptyParseResult(
+        content.getUrl(), null);
+  }
+
+  /**
+   * Method parses a {@link Content} object using the {@link Parser} specified
+   * by the parameter <code>extId</code>, i.e., the Parser's extension ID. If a
+   * suitable {@link Parser} is not found, then a <code>WARNING</code> level
+   * message is logged, and a ParseException is thrown. If the parse is
+ * unsuccessful for any other reason, then a <code>WARNING</code> level message
+   * is logged, and a <code>ParseStatus.getEmptyParse()</code> is returned.
+   * 
+   * @param extId
+   *          The extension implementation ID of the {@link Parser} to use to
+   *          parse the specified content.
+   * @param content
+   *          The content to parse.
+   * 
+   * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful,
+   *         otherwise, a single &lt;key,
+   *         <code>ParseStatus.getEmptyParse()</code>&gt; pair.
+   * 
+   * @throws ParseException
+   *           If there is no suitable {@link Parser} found to perform the
+   *           parse.
+   */
+  public ParseResult parseByExtensionId(String extId, Content content)
+      throws ParseException {
+    Parser p = null;
+
+    try {
+      p = this.parserFactory.getParserById(extId);
+    } catch (ParserNotFound e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("No suitable parser found when trying to parse content "
+            + content.getUrl() + " of type " + content.getContentType());
+      }
+      throw new ParseException(e.getMessage());
+    }
+
+    ParseResult parseResult = null;
+    if (maxParseTime != -1)
+      parseResult = runParser(p, content);
+    else
+      parseResult = p.getParse(content);
+    if (parseResult != null && !parseResult.isEmpty()) {
+      return parseResult;
+    } else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Unable to successfully parse content " + content.getUrl()
+            + " of type " + content.getContentType());
+      }
+      return new ParseStatus(new ParseException(
+          "Unable to successfully parse content")).getEmptyParseResult(
+          content.getUrl(), null);
+    }
+  }
+
+  private ParseResult runParser(Parser p, Content content) {
+    ParseCallable pc = new ParseCallable(p, content);
+    Future<ParseResult> task = executorService.submit(pc);
+    ParseResult res = null;
+    try {
+      res = task.get(maxParseTime, TimeUnit.SECONDS);
+    } catch (Exception e) {
+      LOG.warn("Error parsing " + content.getUrl() + " with " + p, e);
+      task.cancel(true);
+    } finally {
+      pc = null;
+    }
+    return res;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java b/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
new file mode 100644
index 0000000..d101453
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/Parser.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * A parser for content generated by a
+ * {@link org.apache.nutch.protocol.Protocol} implementation. This interface is
+ * implemented by extensions. Nutch's core contains no page parsing code.
+ */
+public interface Parser extends Pluggable, Configurable {
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = Parser.class.getName();
+
+  /**
+   * <p>
+   * This method parses the given content and returns a map of &lt;key,
+   * parse&gt; pairs. {@link Parse} instances will be persisted under the given
+   * key.
+   * </p>
+   * <p>
+   * Note: Meta-redirects should be followed only when they are coming from the
+   * original URL. That is: <br>
+   * Assume fetcher is in parsing mode and is currently processing
+   * foo.bar.com/redirect.html. If this url contains a meta redirect to another
+   * url, fetcher should only follow the redirect if the map contains an entry
+   * of the form &lt;"foo.bar.com/redirect.html", {@link Parse} with a
+   * {@link ParseStatus} indicating the redirect&gt;.
+   * </p>
+   * 
+   * @param c
+   *          Content to be parsed
+   * @return a map containing &lt;key, parse&gt; pairs
+   * @since NUTCH-443
+   */
+  ParseResult getParse(Content c);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
new file mode 100644
index 0000000..7e5b146
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserChecker.java
@@ -0,0 +1,270 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.StringUtil;
+
+/**
+ * Parser checker, useful for testing parser. It also accurately reports
+ * possible fetching and parsing failures and presents protocol status signals
+ * to aid debugging. The tool enables us to retrieve the following data from any
+ * url:
+ * <ol>
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
+ * type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID) and
+ * is used to remove duplicates during the dedup procedure. It is calculated
+ * using {@link org.apache.nutch.crawl.MD5Signature} or
+ * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+ * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Title</tt>: of the URL</li>
+ * <li><tt>Outlinks</tt>: associated with the URL</li>
+ * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
+ * <i>Cache-Control</i>, etc.</li>
+ * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
+ * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length depending
+ * on <code>content.length</code> configuration.</li>
+ * </ol>
+ * 
+ * @author John Xing
+ */
+
+public class ParserChecker implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ParserChecker.class);
+  private Configuration conf;
+
+  public ParserChecker() {
+  }
+
+  public int run(String[] args) throws Exception {
+    boolean dumpText = false;
+    boolean force = false;
+    String contentType = null;
+    String url = null;
+
+    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
+
+    if (args.length == 0) {
+      LOG.error(usage);
+      return (-1);
+    }
+
+    // used to simulate the metadata propagated from injection
+    HashMap<String, String> metadata = new HashMap<String, String>();
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-forceAs")) {
+        force = true;
+        contentType = args[++i];
+      } else if (args[i].equals("-dumpText")) {
+        dumpText = true;
+      } else if (args[i].equals("-md")) {
+        String k = null, v = null;
+        String nextOne = args[++i];
+        int firstEquals = nextOne.indexOf("=");
+        if (firstEquals != -1) {
+          k = nextOne.substring(0, firstEquals);
+          v = nextOne.substring(firstEquals + 1);
+        } else
+          k = nextOne;
+        metadata.put(k, v);
+      } else if (i != args.length - 1) {
+        LOG.error(usage);
+        System.exit(-1);
+      } else {
+        url = URLUtil.toASCII(args[i]);
+      }
+    }
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("fetching: " + url);
+    }
+
+    CrawlDatum cd = new CrawlDatum();
+
+    Iterator<String> iter = metadata.keySet().iterator();
+    while (iter.hasNext()) {
+      String key = iter.next();
+      String value = metadata.get(key);
+      if (value == null)
+        value = "";
+      cd.getMetaData().put(new Text(key), new Text(value));
+    }
+
+    ProtocolFactory factory = new ProtocolFactory(conf);
+    Protocol protocol = factory.getProtocol(url);
+    Text turl = new Text(url);
+    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
+
+    // If the configuration permits, handle redirects until we either run
+    // out of allowed redirects or we stop getting redirect statuses.
+    int maxRedirects = conf.getInt("http.redirect.max", 0);
+    int numRedirects = 0;
+    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
+        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
+        LOG.info("Handling redirect to " + newURL);
+
+        protocol = factory.getProtocol(newURL);
+        turl = new Text(newURL);
+        output = protocol.getProtocolOutput(turl, cd);
+
+        numRedirects++;
+    }
+
+    if (!output.getStatus().isSuccess()) {
+      System.err.println("Fetch failed with protocol status: "
+          + output.getStatus());
+
+      if (output.getStatus().isRedirect()) {
+          System.err.println("Redirect(s) not handled due to configuration.");
+          System.err.println("Max Redirects to handle per config: " + maxRedirects);
+          System.err.println("Number of Redirects handled: " + numRedirects);
+      }
+      return (-1);
+    }
+
+    Content content = output.getContent();
+
+    if (content == null) {
+      LOG.error("No content for " + url);
+      return (-1);
+    }
+
+    if (force) {
+      content.setContentType(contentType);
+    } else {
+      contentType = content.getContentType();
+    }
+
+    if (contentType == null) {
+      LOG.error("Failed to determine content type!");
+      return (-1);
+    }
+
+    if (ParseSegment.isTruncated(content)) {
+      LOG.warn("Content is truncated, parse may fail!");
+    }
+
+    ScoringFilters scfilters = new ScoringFilters(conf);
+    // call the scoring filters
+    try {
+      scfilters.passScoreBeforeParsing(turl, cd, content);
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e
+            + ")");
+        LOG.warn(StringUtils.stringifyException(e));
+      }
+    }
+
+    ParseResult parseResult = new ParseUtil(conf).parse(content);
+
+    if (parseResult == null) {
+      LOG.error("Parsing content failed!");
+      return (-1);
+    }
+
+    // Calculate the signature
+    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+        content, parseResult.get(new Text(url)));
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("parsing: " + url);
+      LOG.info("contentType: " + contentType);
+      LOG.info("signature: " + StringUtil.toHexString(signature));
+    }
+
+    Parse parse = parseResult.get(turl);
+    if (parse == null) {
+      LOG.error("Failed to get parse from parse result");
+      LOG.error("Available parses in parse result (by URL key):");
+      for (Map.Entry<Text, Parse> entry : parseResult) {
+        LOG.error("  " + entry.getKey());
+      }
+      LOG.error("Parse result does not contain a parse for URL to be checked:");
+      LOG.error("  " + turl);
+      return -1;
+    }
+
+    // call the scoring filters
+    try {
+      scfilters.passScoreAfterParsing(turl, content, parse);
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e
+            + ")");
+        LOG.warn(StringUtils.stringifyException(e));
+      }
+    }
+
+    for (Map.Entry<Text, Parse> entry : parseResult) {
+      parse = entry.getValue();
+      LOG.info("---------\nUrl\n---------------\n");
+      System.out.print(entry.getKey());
+      LOG.info("\n---------\nParseData\n---------\n");
+      System.out.print(parse.getData().toString());
+      if (dumpText) {
+        LOG.info("---------\nParseText\n---------\n");
+        System.out.print(parse.getText());
+      }
+    }
+
+    return 0;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration c) {
+    conf = c;
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new ParserChecker(),
+        args);
+    System.exit(res);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
new file mode 100644
index 0000000..0982de4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserFactory.java
@@ -0,0 +1,428 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Vector;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.ObjectCache;
+
+/** Creates and caches {@link Parser} plugins. */
+public final class ParserFactory {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ParserFactory.class);
+
+  /** Wildcard for default plugins. */
+  public static final String DEFAULT_PLUGIN = "*";
+
+  /** Empty extension list for caching purposes. */
+  private final List<Extension> EMPTY_EXTENSION_LIST = Collections
+      .<Extension> emptyList();
+
+  private Configuration conf;
+  private ExtensionPoint extensionPoint;
+  private ParsePluginList parsePluginList;
+
+  public ParserFactory(Configuration conf) {
+    this.conf = conf;
+    ObjectCache objectCache = ObjectCache.get(conf);
+    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
+        Parser.X_POINT_ID);
+    this.parsePluginList = (ParsePluginList) objectCache
+        .getObject(ParsePluginList.class.getName());
+
+    if (this.parsePluginList == null) {
+      this.parsePluginList = new ParsePluginsReader().parse(conf);
+      objectCache.setObject(ParsePluginList.class.getName(),
+          this.parsePluginList);
+    }
+
+    if (this.extensionPoint == null) {
+      throw new RuntimeException("x point " + Parser.X_POINT_ID + " not found.");
+    }
+    if (this.parsePluginList == null) {
+      throw new RuntimeException(
+          "Parse Plugins preferences could not be loaded.");
+    }
+  }
+
+  /**
+   * Function returns an array of {@link Parser}s for a given content type.
+   * 
+   * The function consults the internal list of parse plugins for the
+   * ParserFactory to determine the list of pluginIds, then gets the appropriate
+   * extension points to instantiate as {@link Parser}s.
+   * 
+   * @param contentType
+   *          The contentType to return the <code>Array</code> of {@link Parser}
+   *          s for.
+   * @param url
+   *          The url for the content that may allow us to get the type from the
+   *          file suffix.
+   * @return An <code>Array</code> of {@link Parser}s for the given contentType.
+   *         If there were plugins mapped to a contentType via the
+   *         <code>parse-plugins.xml</code> file, but never enabled via the
+   *         <code>plugin.includes</code> Nutch conf, then those plugins won't
+   *         be part of this array, i.e., they will be skipped. So, if the
+   *         ordered list of parsing plugins for <code>text/plain</code> was
+   *         <code>[parse-text,parse-html,
+   *         parse-rtf]</code>, and only <code>parse-html</code> and
+   *         <code>parse-rtf</code> were enabled via
+   *         <code>plugin.includes</code>, then this ordered Array would consist
+   *         of two {@link Parser} interfaces,
+   *         <code>[parse-html, parse-rtf]</code>.
+   */
+  public Parser[] getParsers(String contentType, String url)
+      throws ParserNotFound {
+
+    List<Parser> parsers = null;
+    List<Extension> parserExts = null;
+
+    ObjectCache objectCache = ObjectCache.get(conf);
+
+    // TODO once the MimeTypes is available
+    // parsers = getExtensions(MimeUtils.map(contentType));
+    // if (parsers != null) {
+    // return parsers;
+    // }
+    // Last Chance: Guess content-type from file url...
+    // parsers = getExtensions(MimeUtils.getMimeType(url));
+
+    parserExts = getExtensions(contentType);
+    if (parserExts == null) {
+      throw new ParserNotFound(url, contentType);
+    }
+
+    parsers = new Vector<Parser>(parserExts.size());
+    for (Iterator<Extension> i = parserExts.iterator(); i.hasNext();) {
+      Extension ext = i.next();
+      Parser p = null;
+      try {
+        // check to see if we've cached this parser instance yet
+        p = (Parser) objectCache.getObject(ext.getId());
+        if (p == null) {
+          // go ahead and instantiate it and then cache it
+          p = (Parser) ext.getExtensionInstance();
+          objectCache.setObject(ext.getId(), p);
+        }
+        parsers.add(p);
+      } catch (PluginRuntimeException e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("ParserFactory:PluginRuntimeException when "
+              + "initializing parser plugin "
+              + ext.getDescriptor().getPluginId() + " instance in getParsers "
+              + "function: attempting to continue instantiating parsers");
+        }
+      }
+    }
+    return parsers.toArray(new Parser[] {});
+  }
+
+  /**
+   * Function returns a {@link Parser} instance with the specified
+   * <code>extId</code>, representing its extension ID. If the Parser instance
+   * isn't found, then the function throws a <code>ParserNotFound</code>
+   * exception. If the function is able to find the {@link Parser} in the
+   * internal <code>PARSER_CACHE</code> then it will return the already
+   * instantiated Parser. Otherwise, if it has to instantiate the Parser itself
+   * , then this function will cache that Parser in the internal
+   * <code>PARSER_CACHE</code>.
+   * 
+   * @param id
+   *          The string extension ID (e.g.,
+   *          "org.apache.nutch.parse.rss.RSSParser",
+   *          "org.apache.nutch.parse.rtf.RTFParseFactory") of the
+   *          {@link Parser} implementation to return.
+   * @return A {@link Parser} implementation specified by the parameter
+   *         <code>id</code>.
+   * @throws ParserNotFound
+   *           If the Parser is not found (i.e., not registered with the
+   *           extension point), or if a {@link PluginRuntimeException} occurs
+   *           while instantiating the {@link Parser}.
+   */
+  public Parser getParserById(String id) throws ParserNotFound {
+
+    Extension[] extensions = this.extensionPoint.getExtensions();
+    Extension parserExt = null;
+
+    ObjectCache objectCache = ObjectCache.get(conf);
+
+    if (id != null) {
+      parserExt = getExtension(extensions, id);
+    }
+    if (parserExt == null) {
+      parserExt = getExtensionFromAlias(extensions, id);
+    }
+
+    if (parserExt == null) {
+      throw new ParserNotFound("No Parser Found for id [" + id + "]");
+    }
+
+    // first check the cache
+    if (objectCache.getObject(parserExt.getId()) != null) {
+      return (Parser) objectCache.getObject(parserExt.getId());
+
+      // if not found in cache, instantiate the Parser
+    } else {
+      try {
+        Parser p = (Parser) parserExt.getExtensionInstance();
+        objectCache.setObject(parserExt.getId(), p);
+        return p;
+      } catch (PluginRuntimeException e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Canno initialize parser "
+              + parserExt.getDescriptor().getPluginId() + " (cause: "
+              + e.toString());
+        }
+        throw new ParserNotFound("Cannot init parser for id [" + id + "]");
+      }
+    }
+  }
+
+  /**
+   * Finds the best-suited parse plugin for a given contentType.
+   * 
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return a list of extensions to be used for this contentType. If none,
+   *         returns <code>null</code>.
+   */
+  @SuppressWarnings("unchecked")
+  protected List<Extension> getExtensions(String contentType) {
+
+    ObjectCache objectCache = ObjectCache.get(conf);
+    // First of all, tries to clean the content-type
+    String type = null;
+    type = MimeUtil.cleanMimeType(contentType);
+
+    List<Extension> extensions = (List<Extension>) objectCache.getObject(type);
+
+    // Just compare the reference:
+    // if this is the empty list, we know we will find no extension.
+    if (extensions == EMPTY_EXTENSION_LIST) {
+      return null;
+    }
+
+    if (extensions == null) {
+      extensions = findExtensions(type);
+      if (extensions != null) {
+        objectCache.setObject(type, extensions);
+      } else {
+        // Put the empty extension list into cache
+        // to remember we don't know any related extension.
+        objectCache.setObject(type, EMPTY_EXTENSION_LIST);
+      }
+    }
+    return extensions;
+  }
+
+  /**
+   * searches a list of suitable parse plugins for the given contentType.
+   * <p>
+   * It first looks for a preferred plugin defined in the parse-plugin file. If
+   * none is found, it returns a list of default plugins.
+   * 
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return List - List of extensions to be used for this contentType. If none,
+   *         returns null.
+   */
+  private List<Extension> findExtensions(String contentType) {
+
+    Extension[] extensions = this.extensionPoint.getExtensions();
+
+    // Look for a preferred plugin.
+    List<String> parsePluginList = this.parsePluginList
+        .getPluginList(contentType);
+    List<Extension> extensionList = matchExtensions(parsePluginList,
+        extensions, contentType);
+    if (extensionList != null) {
+      return extensionList;
+    }
+
+    // If none found, look for a default plugin.
+    parsePluginList = this.parsePluginList.getPluginList(DEFAULT_PLUGIN);
+    return matchExtensions(parsePluginList, extensions, DEFAULT_PLUGIN);
+  }
+
+  /**
+   * Tries to find a suitable parser for the given contentType.
+   * <ol>
+   * <li>It checks if a parser which accepts the contentType can be found in the
+   * <code>plugins</code> list;</li>
+   * <li>If this list is empty, it tries to find amongst the loaded extensions
+   * whether some of them might suit, and warns the user.</li>
+   * </ol>
+   * 
+   * @param plugins
+   *          List of candidate plugin ids mapped to this contentType (may be
+   *          null when no mapping exists in parse-plugins.xml).
+   * @param extensions
+   *          Array of loaded extensions.
+   * @param contentType
+   *          Content-Type for which we seek a parse plugin.
+   * @return List - List of extensions to be used for this contentType. If none,
+   *         returns null.
+   */
+  private List<Extension> matchExtensions(List<String> plugins,
+      Extension[] extensions, String contentType) {
+
+    List<Extension> extList = new ArrayList<Extension>();
+    if (plugins != null) {
+
+      // A mapping exists: resolve each mapped plugin id to a loaded extension,
+      // preserving the order declared in parse-plugins.xml.
+      for (String parsePluginId : plugins) {
+
+        Extension ext = getExtension(extensions, parsePluginId, contentType);
+        // the extension returned may be null
+        // that means that it was not enabled in the plugin.includes
+        // nutch conf property, but it was mapped in the
+        // parse-plugins.xml
+        // file.
+        // OR it was enabled in plugin.includes, but the plugin's plugin.xml
+        // file does not claim that the plugin supports the specified mimeType
+        // in either case, LOG the appropriate error message to WARN level
+
+        if (ext == null) {
+          // try to get it just by its pluginId
+          ext = getExtension(extensions, parsePluginId);
+
+          if (LOG.isWarnEnabled()) {
+            if (ext != null) {
+              // plugin was enabled via plugin.includes
+              // its plugin.xml just doesn't claim to support that
+              // particular mimeType
+              LOG.warn("ParserFactory:Plugin: " + parsePluginId
+                  + " mapped to contentType " + contentType
+                  + " via parse-plugins.xml, but " + "its plugin.xml "
+                  + "file does not claim to support contentType: "
+                  + contentType);
+            } else {
+              // plugin wasn't enabled via plugin.includes
+              LOG.warn("ParserFactory: Plugin: " + parsePluginId
+                  + " mapped to contentType " + contentType
+                  + " via parse-plugins.xml, but not enabled via "
+                  + "plugin.includes in nutch-default.xml");
+            }
+          }
+        }
+
+        if (ext != null) {
+          // add it to the list
+          extList.add(ext);
+        }
+      }
+
+    } else {
+      // okay, there were no list of plugins defined for
+      // this mimeType, however, there may be plugins registered
+      // via the plugin.includes nutch conf property that claim
+      // via their plugin.xml file to support this contentType
+      // so, iterate through the list of extensions and if you find
+      // any extensions where this is the case, throw a
+      // NotMappedParserException
+
+      for (int i = 0; i < extensions.length; i++) {
+        // wildcard parsers are prepended so they take precedence over
+        // parsers matched by explicit contentType patterns
+        if ("*".equals(extensions[i].getAttribute("contentType"))) {
+          extList.add(0, extensions[i]);
+        } else if (extensions[i].getAttribute("contentType") != null
+            && contentType.matches(escapeContentType(extensions[i]
+                .getAttribute("contentType")))) {
+          extList.add(extensions[i]);
+        }
+      }
+
+      if (extList.size() > 0) {
+        if (LOG.isInfoEnabled()) {
+          StringBuffer extensionsIDs = new StringBuffer("[");
+          boolean isFirst = true;
+          for (Extension ext : extList) {
+            if (!isFirst)
+              extensionsIDs.append(" - ");
+            else
+              isFirst = false;
+            extensionsIDs.append(ext.getId());
+          }
+          extensionsIDs.append("]");
+          LOG.info("The parsing plugins: " + extensionsIDs.toString()
+              + " are enabled via the plugin.includes system "
+              + "property, and all claim to support the content type "
+              + contentType + ", but they are not mapped to it  in the "
+              + "parse-plugins.xml file");
+        }
+      } else if (LOG.isDebugEnabled()) {
+        LOG.debug("ParserFactory:No parse plugins mapped or enabled for "
+            + "contentType " + contentType);
+      }
+    }
+
+    // callers expect null (not an empty list) when nothing matched
+    return (extList.size() > 0) ? extList : null;
+  }
+
+  /**
+   * Escapes a content type declared in a plugin descriptor so that it can be
+   * used as a regular expression while keeping backwards compatibility with
+   * plain literal types. Only '+' and '.' are escaped; other regex
+   * metacharacters are intentionally left active, which enables a single
+   * parser to accept multiple content types via a pattern.
+   */
+  private String escapeContentType(String contentType) {
+    return contentType.replace("+", "\\+").replace(".", "\\.");
+  }
+
+  /**
+   * Returns true when the extension has the given id AND one of the following
+   * holds: its contentType attribute is the wildcard "*", the requested type
+   * matches the (regex-escaped) contentType attribute, or the requested type
+   * is the DEFAULT_PLUGIN marker.
+   */
+  private boolean match(Extension extension, String id, String type) {
+    return ((id.equals(extension.getId())) && (extension.getAttribute(
+        "contentType").equals("*")
+        || type
+            .matches(escapeContentType(extension.getAttribute("contentType"))) || type
+          .equals(DEFAULT_PLUGIN)));
+  }
+
+  /**
+   * Get an extension from its id and supported content-type.
+   *
+   * @return the first extension in {@code list} matching both criteria (see
+   *         {@code match}), or null when none does.
+   */
+  private Extension getExtension(Extension[] list, String id, String type) {
+    for (int i = 0; i < list.length; i++) {
+      if (match(list[i], id, type)) {
+        return list[i];
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Get an extension by its id alone, ignoring content type. Used as a
+   * fallback to distinguish "plugin not enabled" from "plugin enabled but not
+   * claiming this content type".
+   */
+  private Extension getExtension(Extension[] list, String id) {
+    for (int i = 0; i < list.length; i++) {
+      if (id.equals(list[i].getId())) {
+        return list[i];
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Resolve an alias (as declared in parse-plugins.xml) to its real extension
+   * id, then look that extension up by id. Returns null when the alias is
+   * unknown or the aliased extension is not loaded.
+   */
+  private Extension getExtensionFromAlias(Extension[] list, String id) {
+    return getExtension(list, parsePluginList.getAliases().get(id));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
new file mode 100644
index 0000000..2857efa
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParserNotFound.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+/**
+ * Exception thrown when no suitable parser can be found for a given URL and
+ * content type. Carries the offending URL and content type so callers can
+ * report them.
+ */
+public class ParserNotFound extends ParseException {
+
+  private static final long serialVersionUID = 23993993939L;
+  // URL for which no parser was found (null when constructed from a bare message)
+  private String url;
+  // Content type for which no parser was found (null when constructed from a bare message)
+  private String contentType;
+
+  /** Construct with a free-form message only; url and contentType stay null. */
+  public ParserNotFound(String message) {
+    super(message);
+  }
+
+  /** Construct with a default message built from the url and contentType. */
+  public ParserNotFound(String url, String contentType) {
+    this(url, contentType, "parser not found for contentType=" + contentType
+        + " url=" + url);
+  }
+
+  /** Construct with an explicit message, recording the url and contentType. */
+  public ParserNotFound(String url, String contentType, String message) {
+    super(message);
+    this.url = url;
+    this.contentType = contentType;
+  }
+
+  /** @return the URL that could not be parsed, or null if unknown */
+  public String getUrl() {
+    return url;
+  }
+
+  /** @return the content type no parser was found for, or null if unknown */
+  public String getContentType() {
+    return contentType;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java b/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
new file mode 100644
index 0000000..40bd3e2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The {@link org.apache.nutch.parse.Parse Parse} interface and related classes.
+ */
+package org.apache.nutch.parse;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java b/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
new file mode 100644
index 0000000..f50c11a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/CircularDependencyException.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+/**
+ * <code>CircularDependencyException</code> will be thrown if a circular
+ * dependency is detected.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+/**
+ * <code>CircularDependencyException</code> will be thrown if a circular
+ * dependency is detected.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class CircularDependencyException extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  /** Wrap an underlying cause detected during dependency resolution. */
+  public CircularDependencyException(Throwable cause) {
+    super(cause);
+  }
+
+  /** Construct with a descriptive message naming the dependency cycle. */
+  public CircularDependencyException(String message) {
+    super(message);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java b/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
new file mode 100644
index 0000000..b0ee0af
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/Extension.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * An <code>Extension</code> is a kind of listener descriptor that will be
+ * installed on a concrete <code>ExtensionPoint</code> that acts as kind of
+ * Publisher.
+ */
+public class Extension {
+  // Descriptor of the plugin that declares this extension
+  private PluginDescriptor fDescriptor;
+  // Unique id of this extension
+  private String fId;
+  // Id of the extension point this extension implements
+  private String fTargetPoint;
+  // Fully qualified class name of the extension implementation
+  private String fClazz;
+  // Attributes declared in the plugin manifest for this extension
+  private HashMap<String, String> fAttributes;
+  private Configuration conf;
+
+  /**
+   * @param pDescriptor
+   *          a plugin descriptor
+   * @param pExtensionPoint
+   *          an extension point
+   * @param pId
+   *          a unique id of the plugin
+   * @param pExtensionClass
+   *          fully qualified class name of the extension implementation
+   * @param conf
+   *          the configuration passed to Configurable extension instances
+   * @param pluginRepository
+   *          the plugin repository (currently unused by this constructor)
+   */
+  public Extension(PluginDescriptor pDescriptor, String pExtensionPoint,
+      String pId, String pExtensionClass, Configuration conf,
+      PluginRepository pluginRepository) {
+    fAttributes = new HashMap<String, String>();
+    setDescriptor(pDescriptor);
+    setExtensionPoint(pExtensionPoint);
+    setId(pId);
+    setClazz(pExtensionClass);
+    this.conf = conf;
+  }
+
+  /**
+   * Sets the id of the extension point this extension targets.
+   *
+   * @param point
+   *          the extension point id
+   */
+  private void setExtensionPoint(String point) {
+    fTargetPoint = point;
+  }
+
+  /**
+   * Returns an attribute value that is set up in the manifest file and is
+   * defined by the extension point XML schema.
+   * 
+   * @param pKey
+   *          a key
+   * @return String a value, or null when the attribute is not present
+   */
+  public String getAttribute(String pKey) {
+    return fAttributes.get(pKey);
+  }
+
+  /**
+   * Returns the full class name of the extension point implementation
+   * 
+   * @return String
+   */
+  public String getClazz() {
+    return fClazz;
+  }
+
+  /**
+   * Return the unique id of the extension.
+   * 
+   * @return String
+   */
+  public String getId() {
+    return fId;
+  }
+
+  /**
+   * Adds an attribute; only used until model creation at plugin system
+   * start-up.
+   * 
+   * @param pKey
+   *          a key
+   * @param pValue
+   *          a value
+   */
+  public void addAttribute(String pKey, String pValue) {
+    fAttributes.put(pKey, pValue);
+  }
+
+  /**
+   * Sets the class that implements the concrete extension; only used until
+   * model creation at system start-up.
+   * 
+   * @param extensionClazz
+   *          The extension class name to set
+   */
+  public void setClazz(String extensionClazz) {
+    fClazz = extensionClazz;
+  }
+
+  /**
+   * Sets the unique extension Id and is only used until model creation at
+   * system start up.
+   * 
+   * @param extensionID
+   *          The extensionID to set
+   */
+  public void setId(String extensionID) {
+    fId = extensionID;
+  }
+
+  /**
+   * Returns the Id of the extension point, that is implemented by this
+   * extension.
+   */
+  public String getTargetPoint() {
+    return fTargetPoint;
+  }
+
+  /**
+   * Return an instance of the extension implementation. Before we create an
+   * extension instance we start up the plugin if that is not already done. The
+   * plugin instance and the extension instance use the same
+   * <code>PluginClassLoader</code>. Each Plugin uses its own classloader. The
+   * PluginClassLoader knows only its own <i>Plugin runtime libraries</i>
+   * declared in the plugin manifest file and the exported libraries of the
+   * dependent plugins.
+   * 
+   * @return Object An instance of the extension implementation
+   */
+  public Object getExtensionInstance() throws PluginRuntimeException {
+    // Must synchronize here to make sure creation and initialization
+    // of a plugin instance and it extension instance are done by
+    // one and only one thread.
+    // The same is in PluginRepository.getPluginInstance().
+    // Suggested by Stefan Groschupf <sg...@media-style.com>
+    // NOTE(review): this locks on the id String itself — presumably relies on
+    // ids being interned/shared; confirm this actually serializes all callers.
+    synchronized (getId()) {
+      try {
+        PluginRepository pluginRepository = PluginRepository.get(conf);
+        Class<?> extensionClazz = pluginRepository.getCachedClass(fDescriptor,
+            getClazz());
+        // lazy loading of Plugin in case there is no instance of the plugin
+        // already.
+        pluginRepository.getPluginInstance(getDescriptor());
+        Object object = extensionClazz.newInstance();
+        // propagate the Nutch configuration into Configurable extensions
+        if (object instanceof Configurable) {
+          ((Configurable) object).setConf(this.conf);
+        }
+        return object;
+      } catch (ClassNotFoundException e) {
+        throw new PluginRuntimeException(e);
+      } catch (InstantiationException e) {
+        throw new PluginRuntimeException(e);
+      } catch (IllegalAccessException e) {
+        throw new PluginRuntimeException(e);
+      }
+    }
+  }
+
+  /**
+   * Return the plugin descriptor.
+   * 
+   * @return PluginDescriptor
+   */
+  public PluginDescriptor getDescriptor() {
+    return fDescriptor;
+  }
+
+  /**
+   * Sets the plugin descriptor; only used until model creation at system
+   * start-up.
+   * 
+   * @param pDescriptor
+   *          the descriptor of the declaring plugin
+   */
+  public void setDescriptor(PluginDescriptor pDescriptor) {
+    fDescriptor = pDescriptor;
+  }
+}


[36/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java b/nutch-core/src/main/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
new file mode 100644
index 0000000..1b425c4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
@@ -0,0 +1,393 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.InetAddress;
+import java.net.URLEncoder;
+import java.net.UnknownHostException;
+import java.text.ParseException;
+import java.util.List;
+
+import org.apache.commons.httpclient.URIException;
+import org.apache.commons.httpclient.util.URIUtil;
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.SimpleDateFormat;
+
+/**
+ * Abstract class that implements {@see CommonCrawlFormat} interface. 
+ *
+ */
+public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
+  protected static final Logger LOG = LoggerFactory.getLogger(AbstractCommonCrawlFormat.class.getName());
+
+  // URL of the record being serialized
+  protected String url;
+
+  // Fetched content (body bytes) of the record
+  protected Content content;
+
+  // Fetch/parse metadata (HTTP headers, status, etc.)
+  protected Metadata metadata;
+
+  protected Configuration conf;
+
+  // Optional prefix prepended to the record key (a "-" is appended to it)
+  protected String keyPrefix;
+
+  // When true, dates are emitted as epoch-millis strings instead of raw header values
+  protected boolean simpleDateFormat;
+
+  // When true, headers are emitted as JSON arrays of [name, value] pairs
+  protected boolean jsonArray;
+
+  // When true, the key is the pre-computed reversed-host value instead of the URL
+  protected boolean reverseKey;
+
+  protected String reverseKeyValue;
+
+  protected List<String> inLinks;
+
+  /**
+   * @param url URL of the record
+   * @param content fetched content of the record
+   * @param metadata fetch/parse metadata of the record
+   * @param nutchConf Nutch configuration (agent, accept headers, ...)
+   * @param config Common Crawl output options (key prefix, date format, ...)
+   */
+  public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
+    this.url = url;
+    this.content = content;
+    this.metadata = metadata;
+    this.conf = nutchConf;
+
+    this.keyPrefix = config.getKeyPrefix();
+    this.simpleDateFormat = config.getSimpleDateFormat();
+    this.jsonArray = config.getJsonArray();
+    this.reverseKey = config.getReverseKey();
+    this.reverseKeyValue = config.getReverseKeyValue();
+  }
+
+  /**
+   * Convenience overload: swap in a new record (url/content/metadata) and
+   * serialize it with the configuration given at construction time.
+   */
+  public String getJsonData(String url, Content content, Metadata metadata)
+      throws IOException {
+    this.url = url;
+    this.content = content;
+    this.metadata = metadata;
+
+    return this.getJsonData();
+  }
+
+  /**
+   * Variant that also receives parse data; concrete formats that need it must
+   * override, the base implementation always throws.
+   */
+  public String getJsonData(String url, Content content, Metadata metadata,
+      ParseData parseData) throws IOException {
+
+    // override of this is required in the actual formats
+    throw new NotImplementedException();
+  }
+
+  /**
+   * Serializes the current record into the Common Crawl JSON structure by
+   * driving the abstract writer callbacks implemented by concrete formats.
+   */
+  @Override
+  public String getJsonData() throws IOException {
+    try {
+      startObject(null);
+
+      // url
+      writeKeyValue("url", getUrl());
+
+      // timestamp
+      writeKeyValue("timestamp", getTimestamp());
+
+      // request
+      startObject("request");
+      writeKeyValue("method", getMethod());
+      startObject("client");
+      writeKeyValue("hostname", getRequestHostName());
+      writeKeyValue("address", getRequestHostAddress());
+      writeKeyValue("software", getRequestSoftware());
+      writeKeyValue("robots", getRequestRobots());
+      startObject("contact");
+      writeKeyValue("name", getRequestContactName());
+      writeKeyValue("email", getRequestContactEmail());
+      closeObject("contact");
+      closeObject("client");
+      // start request headers
+      startHeaders("headers", false, true);
+      writeKeyValueWrapper("Accept", getRequestAccept());
+      writeKeyValueWrapper("Accept-Encoding", getRequestAcceptEncoding());
+      writeKeyValueWrapper("Accept-Language", getRequestAcceptLanguage());
+      writeKeyValueWrapper("User-Agent", getRequestUserAgent());
+      //closeObject("headers");
+      closeHeaders("headers", false, true);
+      writeKeyNull("body");
+      closeObject("request");
+
+      // response
+      startObject("response");
+      writeKeyValue("status", getResponseStatus());
+      startObject("server");
+      writeKeyValue("hostname", getResponseHostName());
+      writeKeyValue("address", getResponseAddress());
+      closeObject("server");
+      // start response headers
+      startHeaders("headers", false, true);
+      writeKeyValueWrapper("Content-Encoding", getResponseContentEncoding());
+      writeKeyValueWrapper("Content-Type", getResponseContentType());
+      writeKeyValueWrapper("Date", getResponseDate());
+      writeKeyValueWrapper("Server", getResponseServer());
+      // remaining metadata headers, skipping the four written explicitly above
+      for (String name : metadata.names()) {
+        if (name.equalsIgnoreCase("Content-Encoding") || name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || name.equalsIgnoreCase("Server")) {
+          continue;
+        }
+        writeKeyValueWrapper(name, metadata.get(name));
+      }
+      closeHeaders("headers", false, true);
+      writeKeyValue("body", getResponseContent());
+      closeObject("response");
+
+      // key
+      // NOTE(review): this mutates keyPrefix in place, so a second call to
+      // getJsonData() on the same instance appends another "-" — confirm
+      // instances are single-use or hoist the "-" into a local.
+      if (!this.keyPrefix.isEmpty()) {
+        this.keyPrefix += "-";
+      }
+      writeKeyValue("key", this.keyPrefix + getKey());
+
+      // imported
+      writeKeyValue("imported", getImported());
+
+      if (getInLinks() != null){
+        startArray("inlinks", false, true);
+        for (String link : getInLinks()) {
+          writeArrayValue(link);
+        }
+        closeArray("inlinks", false, true);
+      }
+      closeObject(null);
+
+      return generateJson();
+
+    } catch (IOException ioe) {
+      LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+      throw new IOException("Error in generating JSON:" + ioe.getMessage());
+    }
+  }
+
+  // abstract methods implemented by the concrete JSON writers
+
+  protected abstract void writeKeyValue(String key, String value) throws IOException;
+
+  protected abstract void writeKeyNull(String key) throws IOException;
+
+  protected abstract void startArray(String key, boolean nested, boolean newline) throws IOException;
+
+  protected abstract void closeArray(String key, boolean nested, boolean newline) throws IOException;
+
+  protected abstract void writeArrayValue(String value) throws IOException;
+
+  protected abstract void startObject(String key) throws IOException;
+
+  protected abstract void closeObject(String key) throws IOException;
+
+  protected abstract String generateJson() throws IOException;
+
+  // getters
+
+  /** Returns the record URL, path-encoded; falls back to the raw URL on failure. */
+  protected String getUrl() {
+    try {
+      return URIUtil.encodePath(url);
+    } catch (URIException e) {
+      LOG.error("Can't encode URL " + url);
+    }
+
+    return url;
+  }
+
+  /**
+   * Returns the Last-Modified metadata value, converted to epoch millis when
+   * simpleDateFormat is enabled (null when that conversion fails).
+   */
+  protected String getTimestamp() {
+    if (this.simpleDateFormat) {
+      String timestamp = null;
+      try {
+        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get(Metadata.LAST_MODIFIED))).getTime();
+        timestamp = String.valueOf(epoch);
+      } catch (ParseException pe) {
+        LOG.warn(pe.getMessage());
+      }
+      return timestamp;
+    } else {
+      return ifNullString(metadata.get(Metadata.LAST_MODIFIED));
+    }
+  }
+
+  protected String getMethod() {
+    return new String("GET");
+  }
+
+  protected String getRequestHostName() {
+    String hostName = "";
+    try {
+      hostName = InetAddress.getLocalHost().getHostName();
+    } catch (UnknownHostException uhe) {
+      // best-effort: fall back to an empty hostname
+    }
+    return hostName;
+  }
+
+  protected String getRequestHostAddress() {
+    String hostAddress = "";
+    try {
+      hostAddress = InetAddress.getLocalHost().getHostAddress();
+    } catch (UnknownHostException uhe) {
+      // best-effort: fall back to an empty address
+    }
+    return hostAddress;
+  }
+
+  protected String getRequestSoftware() {
+    return conf.get("http.agent.version", "");
+  }
+
+  protected String getRequestRobots() {
+    return new String("CLASSIC");
+  }
+
+  protected String getRequestContactName() {
+    return conf.get("http.agent.name", "");
+  }
+
+  protected String getRequestContactEmail() {
+    return conf.get("http.agent.email", "");
+  }
+
+  protected String getRequestAccept() {
+    return conf.get("http.accept", "");
+  }
+
+  protected String getRequestAcceptEncoding() {
+    return new String(""); // TODO
+  }
+
+  protected String getRequestAcceptLanguage() {
+    return conf.get("http.accept.language", "");
+  }
+
+  protected String getRequestUserAgent() {
+    return conf.get("http.robots.agents", "");
+  }
+
+  protected String getResponseStatus() {
+    return ifNullString(metadata.get("status"));
+  }
+
+  protected String getResponseHostName() {
+    return URLUtil.getHost(url);
+  }
+
+  protected String getResponseAddress() {
+    return ifNullString(metadata.get("_ip_"));
+  }
+
+  protected String getResponseContentEncoding() {
+    return ifNullString(metadata.get("Content-Encoding"));
+  }
+
+  protected String getResponseContentType() {
+    return ifNullString(metadata.get("Content-Type"));
+  }
+
+  public List<String> getInLinks() {
+    return inLinks;
+  }
+
+  public void setInLinks(List<String> inLinks) {
+    this.inLinks = inLinks;
+  }
+
+  /**
+   * Returns the Date metadata value, converted to epoch millis when
+   * simpleDateFormat is enabled (null when that conversion fails).
+   */
+  protected String getResponseDate() {
+    if (this.simpleDateFormat) {
+      String timestamp = null;
+      try {
+        long epoch = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
+        timestamp = String.valueOf(epoch);
+      } catch (ParseException pe) {
+        LOG.warn(pe.getMessage());
+      }
+      return timestamp;
+    } else {
+      return ifNullString(metadata.get("Date"));
+    }
+  }
+
+  protected String getResponseServer() {
+    return ifNullString(metadata.get("Server"));
+  }
+
+  // NOTE(review): decodes the body bytes with the platform default charset —
+  // confirm this is intended rather than using the detected content encoding.
+  protected String getResponseContent() {
+    return new String(content.getContent());
+  }
+
+  /** Returns the record key: the reversed-host value when configured, else the URL. */
+  protected String getKey() {
+    if (this.reverseKey) {
+      return this.reverseKeyValue;
+    }
+    else {
+      return url;
+    }
+  }
+
+  /**
+   * Returns the import time (the Date header), converted to epoch millis when
+   * simpleDateFormat is enabled (null when that conversion fails).
+   */
+  protected String getImported() {
+    if (this.simpleDateFormat) {
+      String timestamp = null;
+      try {
+        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
+        timestamp = String.valueOf(epoch);
+      } catch (ParseException pe) {
+        LOG.warn(pe.getMessage());
+      }
+      return timestamp;
+    } else {
+      return ifNullString(metadata.get("Date"));
+    }
+  }
+
+  /** Null-safe helper: maps null to the empty string. */
+  private static String ifNullString(String value) {
+    return (value != null) ? value : "";
+  }
+
+  // Headers are written either as a JSON array or a JSON object, depending on
+  // the jsonArray configuration flag.
+  private void startHeaders(String key, boolean nested, boolean newline) throws IOException {
+    if (this.jsonArray) {
+      startArray(key, nested, newline);
+    }
+    else {
+      startObject(key);
+    }
+  }
+
+  private void closeHeaders(String key, boolean nested, boolean newline) throws IOException {
+    if (this.jsonArray) {
+      closeArray(key, nested, newline);
+    }
+    else {
+      closeObject(key);
+    }
+  }
+
+  // Writes one header as either a two-element array [key, value] or a
+  // key/value pair, depending on the jsonArray configuration flag.
+  private void writeKeyValueWrapper(String key, String value) throws IOException {
+    if (this.jsonArray) {
+      startArray(null, true, false);
+      writeArrayValue(key);
+      writeArrayValue(value);
+      closeArray(null, true, false);
+    }
+    else {
+      writeKeyValue(key, value);
+    }
+  }
+
+  @Override
+  public void close() {}
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/Benchmark.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/Benchmark.java b/nutch-core/src/main/java/org/apache/nutch/tools/Benchmark.java
new file mode 100755
index 0000000..ba42745
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/Benchmark.java
@@ -0,0 +1,284 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.OutputStream;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.CrawlDbReader;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.crawl.LinkDb;
+import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+public class Benchmark extends Configured implements Tool {
+  private static final Log LOG = LogFactory.getLog(Benchmark.class);
+
+  public static void main(String[] args) throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    int res = ToolRunner.run(conf, new Benchmark(), args);
+    System.exit(res);
+  }
+
+  @SuppressWarnings("unused")
+  private static String getDate() {
+    return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System
+        .currentTimeMillis()));
+  }
+
+  private void createSeeds(FileSystem fs, Path seedsDir, int count)
+      throws Exception {
+    OutputStream os = fs.create(new Path(seedsDir, "seeds"));
+    for (int i = 0; i < count; i++) {
+      String url = "http://www.test-" + i + ".com/\r\n";
+      os.write(url.getBytes());
+    }
+    os.flush();
+    os.close();
+  }
+
+  public static final class BenchmarkResults {
+    Map<String, Map<String, Long>> timings = new HashMap<String, Map<String, Long>>();
+    List<String> runs = new ArrayList<String>();
+    List<String> stages = new ArrayList<String>();
+    int seeds, depth, threads;
+    boolean delete;
+    long topN;
+    long elapsed;
+    String plugins;
+
+    public void addTiming(String stage, String run, long timing) {
+      if (!runs.contains(run)) {
+        runs.add(run);
+      }
+      if (!stages.contains(stage)) {
+        stages.add(stage);
+      }
+      Map<String, Long> t = timings.get(stage);
+      if (t == null) {
+        t = new HashMap<String, Long>();
+        timings.put(stage, t);
+      }
+      t.put(run, timing);
+    }
+
+    public String toString() {
+      StringBuilder sb = new StringBuilder();
+      sb.append("* Plugins:\t" + plugins + "\n");
+      sb.append("* Seeds:\t" + seeds + "\n");
+      sb.append("* Depth:\t" + depth + "\n");
+      sb.append("* Threads:\t" + threads + "\n");
+      sb.append("* TopN:\t" + topN + "\n");
+      sb.append("* Delete:\t" + delete + "\n");
+      sb.append("* TOTAL ELAPSED:\t" + elapsed + "\n");
+      for (String stage : stages) {
+        Map<String, Long> timing = timings.get(stage);
+        if (timing == null)
+          continue;
+        sb.append("- stage: " + stage + "\n");
+        for (String r : runs) {
+          Long Time = timing.get(r);
+          if (Time == null) {
+            continue;
+          }
+          sb.append("\trun " + r + "\t" + Time + "\n");
+        }
+      }
+      return sb.toString();
+    }
+
+    public List<String> getStages() {
+      return stages;
+    }
+
+    public List<String> getRuns() {
+      return runs;
+    }
+  }
+
+  public int run(String[] args) throws Exception {
+    String plugins = "protocol-http|parse-tika|scoring-opic|urlfilter-regex|urlnormalizer-pass";
+    int seeds = 1;
+    int depth = 10;
+    int threads = 10;
+    boolean delete = true;
+    long topN = Long.MAX_VALUE;
+
+    if (args.length == 0) {
+      System.err
+          .println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-maxPerHost NN] [-plugins <regex>]");
+      System.err
+          .println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
+      System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
+      System.err
+          .println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
+      System.err
+          .println("\t-keep\tkeep segment data (default: delete after updatedb)");
+      System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
+      System.err.println("\tNOTE: if not specified, this is reset to: "
+          + plugins);
+      System.err
+          .println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
+      System.err
+          .println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
+      return -1;
+    }
+    int maxPerHost = Integer.MAX_VALUE;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-seeds")) {
+        seeds = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-threads")) {
+        threads = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-depth")) {
+        depth = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-keep")) {
+        delete = false;
+      } else if (args[i].equals("-plugins")) {
+        plugins = args[++i];
+      } else if (args[i].equalsIgnoreCase("-maxPerHost")) {
+        maxPerHost = Integer.parseInt(args[++i]);
+      } else {
+        LOG.fatal("Invalid argument: '" + args[i] + "'");
+        return -1;
+      }
+    }
+    BenchmarkResults res = benchmark(seeds, depth, threads, maxPerHost, topN,
+        delete, plugins);
+    System.out.println(res);
+    return 0;
+  }
+
+  public BenchmarkResults benchmark(int seeds, int depth, int threads,
+      int maxPerHost, long topN, boolean delete, String plugins)
+      throws Exception {
+    Configuration conf = getConf();
+    conf.set("http.proxy.host", "localhost");
+    conf.setInt("http.proxy.port", 8181);
+    conf.set("http.agent.name", "test");
+    conf.set("http.robots.agents", "test,*");
+    if (!plugins.equals("default")) {
+      conf.set("plugin.includes", plugins);
+    }
+    conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
+    conf.set(Generator.GENERATOR_COUNT_MODE,
+        Generator.GENERATOR_COUNT_VALUE_HOST);
+    JobConf job = new NutchJob(getConf());
+    FileSystem fs = FileSystem.get(job);
+    Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-"
+        + System.currentTimeMillis());
+    fs.mkdirs(dir);
+    Path rootUrlDir = new Path(dir, "seed");
+    fs.mkdirs(rootUrlDir);
+    createSeeds(fs, rootUrlDir, seeds);
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("crawl started in: " + dir);
+      LOG.info("rootUrlDir = " + rootUrlDir);
+      LOG.info("threads = " + threads);
+      LOG.info("depth = " + depth);
+    }
+    BenchmarkResults res = new BenchmarkResults();
+    res.delete = delete;
+    res.depth = depth;
+    res.plugins = plugins;
+    res.seeds = seeds;
+    res.threads = threads;
+    res.topN = topN;
+    Path crawlDb = new Path(dir + "/crawldb");
+    Path linkDb = new Path(dir + "/linkdb");
+    Path segments = new Path(dir + "/segments");
+    res.elapsed = System.currentTimeMillis();
+    Injector injector = new Injector(getConf());
+    Generator generator = new Generator(getConf());
+    Fetcher fetcher = new Fetcher(getConf());
+    ParseSegment parseSegment = new ParseSegment(getConf());
+    CrawlDb crawlDbTool = new CrawlDb(getConf());
+    LinkDb linkDbTool = new LinkDb(getConf());
+
+    // initialize crawlDb
+    long start = System.currentTimeMillis();
+    injector.inject(crawlDb, rootUrlDir);
+    long delta = System.currentTimeMillis() - start;
+    res.addTiming("inject", "0", delta);
+    int i;
+    for (i = 0; i < depth; i++) { // generate new segment
+      start = System.currentTimeMillis();
+      Path[] segs = generator.generate(crawlDb, segments, -1, topN,
+          System.currentTimeMillis());
+      delta = System.currentTimeMillis() - start;
+      res.addTiming("generate", i + "", delta);
+      if (segs == null) {
+        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
+        break;
+      }
+      start = System.currentTimeMillis();
+      fetcher.fetch(segs[0], threads); // fetch it
+      delta = System.currentTimeMillis() - start;
+      res.addTiming("fetch", i + "", delta);
+      if (!Fetcher.isParsing(job)) {
+        start = System.currentTimeMillis();
+        parseSegment.parse(segs[0]); // parse it, if needed
+        delta = System.currentTimeMillis() - start;
+        res.addTiming("parse", i + "", delta);
+      }
+      start = System.currentTimeMillis();
+      crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
+      delta = System.currentTimeMillis() - start;
+      res.addTiming("update", i + "", delta);
+      start = System.currentTimeMillis();
+      linkDbTool.invert(linkDb, segs, true, true, false); // invert links
+      delta = System.currentTimeMillis() - start;
+      res.addTiming("invert", i + "", delta);
+      // delete data
+      if (delete) {
+        for (Path p : segs) {
+          fs.delete(p, true);
+        }
+      }
+    }
+    if (i == 0) {
+      LOG.warn("No URLs to fetch - check your seed list and URL filters.");
+    }
+    if (LOG.isInfoEnabled()) {
+      LOG.info("crawl finished: " + dir);
+    }
+    res.elapsed = System.currentTimeMillis() - res.elapsed;
+    CrawlDbReader dbreader = new CrawlDbReader();
+    dbreader.processStatJob(crawlDb.toString(), job, false);
+    return res;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlConfig.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlConfig.java b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlConfig.java
new file mode 100644
index 0000000..d8c06c0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlConfig.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Properties;
+
+public class CommonCrawlConfig implements Serializable {
+
+	/**
+	 * Serial version UID
+	 */
+	private static final long serialVersionUID = 5235013733207799661L;
+	
+	// Prefix for key value in the output format
+	private String keyPrefix = "";
+	
+	private boolean simpleDateFormat = false;
+	
+	private boolean jsonArray = false;
+	
+	private boolean reverseKey = false;
+	
+	private String reverseKeyValue = "";
+
+	private boolean compressed = false;
+
+	private long warcSize = 0;
+
+	private String outputDir;
+	
+	/**
+	 * Default constructor
+	 */
+	public CommonCrawlConfig() {
+		// TODO init(this.getClass().getResourceAsStream("CommonCrawlConfig.properties"));
+	}
+	
+	public CommonCrawlConfig(InputStream stream) {
+		init(stream);
+	}
+	
+	private void init(InputStream stream) {
+		if (stream == null) {
+			return;
+		}
+		Properties properties = new Properties();
+		
+		try {
+			properties.load(stream);
+		} catch (IOException e) {
+			// TODO
+		} finally {
+			try {
+				stream.close();
+			} catch (IOException e) {
+				// TODO
+			}
+		}
+
+		setKeyPrefix(properties.getProperty("keyPrefix", ""));
+		setSimpleDateFormat(Boolean.parseBoolean(properties.getProperty("simpleDateFormat", "False")));
+		setJsonArray(Boolean.parseBoolean(properties.getProperty("jsonArray", "False")));
+		setReverseKey(Boolean.parseBoolean(properties.getProperty("reverseKey", "False")));
+	}
+	
+	public void setKeyPrefix(String keyPrefix) {
+		this.keyPrefix = keyPrefix;
+	}
+	
+	public void setSimpleDateFormat(boolean simpleDateFormat) {
+		this.simpleDateFormat = simpleDateFormat;
+	}
+	
+	public void setJsonArray(boolean jsonArray) {
+		this.jsonArray = jsonArray;
+	}
+	
+	public void setReverseKey(boolean reverseKey) {
+		this.reverseKey = reverseKey;
+	}
+	
+	public void setReverseKeyValue(String reverseKeyValue) {
+		this.reverseKeyValue = reverseKeyValue;
+	}
+	
+	public String getKeyPrefix() {
+		return this.keyPrefix;
+	}
+	
+	public boolean getSimpleDateFormat() {
+		return this.simpleDateFormat;
+	}
+	
+	public boolean getJsonArray() {
+		return this.jsonArray;
+	}
+	
+	public boolean getReverseKey() {
+		return this.reverseKey;
+	}
+	
+	public String getReverseKeyValue() {
+		return this.reverseKeyValue;
+	}
+
+	public boolean isCompressed() {
+		return compressed;
+	}
+
+	public void setCompressed(boolean compressed) {
+		this.compressed = compressed;
+	}
+
+	public long getWarcSize() {
+		return warcSize;
+	}
+
+	public void setWarcSize(long warcSize) {
+		this.warcSize = warcSize;
+	}
+
+	public String getOutputDir() {
+		return outputDir;
+	}
+
+	public void setOutputDir(String outputDir) {
+		this.outputDir = outputDir;
+	}
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
new file mode 100644
index 0000000..b4fc0a7
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -0,0 +1,716 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+//JDK imports
+
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
+//Commons imports
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.FilenameUtils;
+
+//Hadoop
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.LinkDbReader;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DumpFileUtil;
+import org.apache.nutch.util.NutchConfiguration;
+//Tika imports
+import org.apache.tika.Tika;
+
+import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.DateFormat;
+import com.ibm.icu.text.SimpleDateFormat;
+
+/**
+ * <p>
+ * The Common Crawl Data Dumper tool enables one to reverse generate the raw
+ * content from Nutch segment data directories into a common crawling data
+ * format, consumed by many applications. The data is then serialized as <a
+ * href="http://cbor.io">CBOR</a>
+ * </p>
+ * <p>
+ * Text content will be stored in a structured document format. Below is a
+ * schema for storage of data and metadata related to a crawling request, with
+ * the response body truncated for readability. This document must be encoded
+ * using CBOR and should be compressed with gzip after encoding. The timestamped
+ * URL key for these records' keys follows the same layout as the media file
+ * directory structure, with underscores in place of directory separators. </li>
+ * </p>
+ * <p>
+ * Thus, the timestamped url key for the record is provided below followed by an
+ * example record:
+ * <p/>
+ * <pre>
+ * {@code
+ * com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000
+ *
+ *     {
+ *         "url": "http:\/\/somepage.com\/22\/14560817",
+ *         "timestamp": "1411623696000",
+ *         "request": {
+ *             "method": "GET",
+ *             "client": {
+ *                 "hostname": "crawler01.local",
+ *                 "address": "74.347.129.200",
+ *                 "software": "Apache Nutch v1.10",
+ *                 "robots": "classic",
+ *                 "contact": {
+ *                     "name": "Nutch Admin",
+ *                     "email": "nutch.pro@nutchadmin.org"
+ *                 }
+ *             },
+ *             "headers": {
+ *                 "Accept": "text\/html,application\/xhtml+xml,application\/xml",
+ *                 "Accept-Encoding": "gzip,deflate,sdch",
+ *                 "Accept-Language": "en-US,en",
+ *                 "User-Agent": "Mozilla\/5.0",
+ *                 "...": "..."
+ *             },
+ *             "body": null
+ *         },
+ *         "response": {
+ *             "status": "200",
+ *             "server": {
+ *                 "hostname": "somepage.com",
+ *                 "address": "55.33.51.19",
+ *             },
+ *             "headers": {
+ *                 "Content-Encoding": "gzip",
+ *                 "Content-Type": "text\/html",
+ *                 "Date": "Thu, 25 Sep 2014 04:16:58 GMT",
+ *                 "Expires": "Thu, 25 Sep 2014 04:16:57 GMT",
+ *                 "Server": "nginx",
+ *                 "...": "..."
+ *             },
+ *             "body": "\r\n  <!DOCTYPE html PUBLIC ... \r\n\r\n  \r\n    </body>\r\n    </html>\r\n  \r\n\r\n",
+ *         },
+ *         "key": "com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000",
+ *         "imported": "1411623698000"
+ *     }
+ *     }
+ * </pre>
+ * <p/>
+ * <p>
+ * Upon successful completion the tool displays a very convenient JSON snippet
+ * detailing the mimetype classifications and the counts of documents which fall
+ * into those classifications. An example is as follows:
+ * </p>
+ * <p/>
+ * <pre>
+ * {@code
+ * INFO: File Types:
+ *   TOTAL Stats:    {
+ *     {"mimeType":"application/xml","count":19"}
+ *     {"mimeType":"image/png","count":47"}
+ *     {"mimeType":"image/jpeg","count":141"}
+ *     {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ *     {"mimeType":"text/plain","count":89"}
+ *     {"mimeType":"video/quicktime","count":2"}
+ *     {"mimeType":"image/gif","count":63"}
+ *     {"mimeType":"application/xhtml+xml","count":1670"}
+ *     {"mimeType":"application/octet-stream","count":40"}
+ *     {"mimeType":"text/html","count":1863"}
+ *   }
+ * }
+ * </pre>
+ */
+public class CommonCrawlDataDumper extends Configured implements Tool {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CommonCrawlDataDumper.class.getName());
+  private static final int MAX_INLINKS = 5000;
+  
+  private CommonCrawlConfig config = null;
+
+  // Gzip initialization
+  private FileOutputStream fileOutput = null;
+  private BufferedOutputStream bufOutput = null;
+  private GzipCompressorOutputStream gzipOutput = null;
+  private TarArchiveOutputStream tarOutput = null;
+  private ArrayList<String> fileList = null;
+
+  /**
+   * Main method for invoking this tool
+   *
+   * @param args 1) output directory (which will be created if it does not
+   *             already exist) to host the CBOR data and 2) a directory
+   *             containing one or more segments from which we wish to generate
+   *             CBOR data from. Optionally, 3) a list of mimetypes and the 4)
+   *             the gzip option may be provided.
+   * @throws Exception
+   */
+  public static void main(String[] args) throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    int res = ToolRunner.run(conf, new CommonCrawlDataDumper(), args);
+    System.exit(res);
+  }
+
+  /**
+   * Constructor
+   */
+  public CommonCrawlDataDumper(CommonCrawlConfig config) {
+    this.config = config;
+  }
+
+  public CommonCrawlDataDumper() {
+  }
+
+  /**
+   * Dumps the reverse engineered CBOR content from the provided segment
+   * directories if a parent directory contains more than one segment,
+   * otherwise a single segment can be passed as an argument. If the boolean
+   * argument is provided then the CBOR is also zipped.
+   *
+   * @param outputDir      the directory you wish to dump the raw content to. This
+   *                       directory will be created.
+   * @param segmentRootDir a directory containing one or more segments.
+   * @param linkdb         Path to linkdb.
+   * @param gzip           a boolean flag indicating whether the CBOR content should also
+   *                       be gzipped.
+   * @param epochFilename  if {@code true}, output files will be names using the epoch time (in milliseconds).
+   * @param extension      a file extension to use with output documents.
+   * @throws Exception if any exception occurs.
+   */
+  public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip,
+      String[] mimeTypes, boolean epochFilename, String extension, boolean warc)
+      throws Exception {
+    if (gzip) {
+      LOG.info("Gzipping CBOR data has been skipped");
+    }
+    // total file counts
+    Map<String, Integer> typeCounts = new HashMap<String, Integer>();
+    // filtered file counters
+    Map<String, Integer> filteredCounts = new HashMap<String, Integer>();
+
+    Configuration nutchConfig = NutchConfiguration.create();
+    final FileSystem fs = FileSystem.get(nutchConfig);
+    Path segmentRootPath = new Path(segmentRootDir.toString());
+
+    //get all paths
+    List<Path> parts = new ArrayList<>();
+    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
+    String partPattern = ".*" + File.separator + Content.DIR_NAME
+        + File.separator + "part-[0-9]{5}" + File.separator + "data";
+    while (files.hasNext()) {
+      LocatedFileStatus next = files.next();
+      if (next.isFile()) {
+        Path path = next.getPath();
+        if (path.toString().matches(partPattern)){
+          parts.add(path);
+        }
+      }
+    }
+
+    LinkDbReader linkDbReader = null;
+    if (linkdb != null) {
+      linkDbReader = new LinkDbReader(fs.getConf(), new Path(linkdb.toString()));
+    }
+    if (parts == null || parts.size() == 0) {
+      LOG.error( "No segment directories found in {} ",
+          segmentRootDir.getAbsolutePath());
+      System.exit(1);
+    }
+    LOG.info("Found {} segment parts", parts.size());
+    if (gzip && !warc) {
+      fileList = new ArrayList<>();
+      constructNewStream(outputDir);
+    }
+
+    for (Path segmentPart : parts) {
+      LOG.info("Processing segment Part : [ {} ]", segmentPart);
+      try {
+        SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
+            SequenceFile.Reader.file(segmentPart));
+
+        Writable key = (Writable) reader.getKeyClass().newInstance();
+
+        Content content = null;
+        while (reader.next(key)) {
+          content = new Content();
+          reader.getCurrentValue(content);
+          Metadata metadata = content.getMetadata();
+          String url = key.toString();
+
+          String baseName = FilenameUtils.getBaseName(url);
+          String extensionName = FilenameUtils.getExtension(url);
+
+          if (!extension.isEmpty()) {
+            extensionName = extension;
+          } else if ((extensionName == null) || extensionName.isEmpty()) {
+            extensionName = "html";
+          }
+
+          String outputFullPath = null;
+          String outputRelativePath = null;
+          String filename = null;
+          String timestamp = null;
+          String reverseKey = null;
+
+          if (epochFilename || config.getReverseKey()) {
+            try {
+              long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z")
+                  .parse(getDate(metadata.get("Date"))).getTime();
+              timestamp = String.valueOf(epoch);
+            } catch (ParseException pe) {
+              LOG.warn(pe.getMessage());
+            }
+
+            reverseKey = reverseUrl(url);
+            config.setReverseKeyValue(
+                reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url)
+                    + "_" + timestamp);
+          }
+
+          if (!warc) {
+            if (epochFilename) {
+              outputFullPath = DumpFileUtil
+                  .createFileNameFromUrl(outputDir.getAbsolutePath(),
+                      reverseKey, url, timestamp, extensionName, !gzip);
+              outputRelativePath = outputFullPath
+                  .substring(0, outputFullPath.lastIndexOf(File.separator) - 1);
+              filename = content.getMetadata().get(Metadata.DATE) + "."
+                  + extensionName;
+            } else {
+              String md5Ofurl = DumpFileUtil.getUrlMD5(url);
+              String fullDir = DumpFileUtil
+                  .createTwoLevelsDirectory(outputDir.getAbsolutePath(),
+                      md5Ofurl, !gzip);
+              filename = DumpFileUtil
+                  .createFileName(md5Ofurl, baseName, extensionName);
+              outputFullPath = String.format("%s/%s", fullDir, filename);
+
+              String[] fullPathLevels = fullDir.split(File.separator);
+              String firstLevelDirName = fullPathLevels[fullPathLevels.length
+                  - 2];
+              String secondLevelDirName = fullPathLevels[fullPathLevels.length
+                  - 1];
+              outputRelativePath = firstLevelDirName + secondLevelDirName;
+            }
+          }
+          // Encode all filetypes if no mimetypes have been given
+          Boolean filter = (mimeTypes == null);
+
+          String jsonData = "";
+          try {
+            String mimeType = new Tika().detect(content.getContent());
+            // Maps file to JSON-based structure
+
+            Set<String> inUrls = null; //there may be duplicates, so using set
+            if (linkDbReader != null) {
+              Inlinks inlinks = linkDbReader.getInlinks((Text) key);
+              if (inlinks != null) {
+                Iterator<Inlink> iterator = inlinks.iterator();
+                inUrls = new LinkedHashSet<>();
+                while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()){
+                  inUrls.add(iterator.next().getFromUrl());
+                }
+              }
+            }
+            //TODO: Make this Jackson Format implementation reusable
+            try (CommonCrawlFormat format = CommonCrawlFormatFactory
+                .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
+              if (inUrls != null) {
+                format.setInLinks(new ArrayList<>(inUrls));
+              }
+              jsonData = format.getJsonData(url, content, metadata);
+            }
+
+            collectStats(typeCounts, mimeType);
+            // collects statistics for the given mimetypes
+            if ((mimeType != null) && (mimeTypes != null) && Arrays
+                .asList(mimeTypes).contains(mimeType)) {
+              collectStats(filteredCounts, mimeType);
+              filter = true;
+            }
+          } catch (IOException ioe) {
+            LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
+            return;
+          }
+
+          if (!warc) {
+            if (filter) {
+              byte[] byteData = serializeCBORData(jsonData);
+
+              if (!gzip) {
+                File outputFile = new File(outputFullPath);
+                if (outputFile.exists()) {
+                  LOG.info("Skipping writing: [" + outputFullPath
+                      + "]: file already exists");
+                } else {
+                  LOG.info("Writing: [" + outputFullPath + "]");
+                  IOUtils.copy(new ByteArrayInputStream(byteData),
+                      new FileOutputStream(outputFile));
+                }
+              } else {
+                if (fileList.contains(outputFullPath)) {
+                  LOG.info("Skipping compressing: [" + outputFullPath
+                      + "]: file already exists");
+                } else {
+                  fileList.add(outputFullPath);
+                  LOG.info("Compressing: [" + outputFullPath + "]");
+                  //TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
+                  TarArchiveEntry tarEntry = new TarArchiveEntry(
+                      outputRelativePath + File.separator + filename);
+                  tarEntry.setSize(byteData.length);
+                  tarOutput.putArchiveEntry(tarEntry);
+                  tarOutput.write(byteData);
+                  tarOutput.closeArchiveEntry();
+                }
+              }
+            }
+          }
+        }
+        reader.close();
+      } catch (Exception e){
+        LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
+      } finally {
+        fs.close();
+      }
+    }
+
+    if (gzip && !warc) {
+      closeStream();
+    }
+
+    if (!typeCounts.isEmpty()) {
+      LOG.info("CommonsCrawlDataDumper File Stats: " + DumpFileUtil
+          .displayFileTypes(typeCounts, filteredCounts));
+    }
+
+  }
+
+  private void closeStream() {
+    try {
+      tarOutput.finish();
+
+      tarOutput.close();
+      gzipOutput.close();
+      bufOutput.close();
+      fileOutput.close();
+    } catch (IOException ioe) {
+      LOG.warn("Error in closing stream: " + ioe.getMessage());
+    }
+  }
+
+  private void constructNewStream(File outputDir) throws IOException {
+    String archiveName = new SimpleDateFormat("yyyyMMddhhmm'.tar.gz'")
+        .format(new Date());
+    LOG.info("Creating a new gzip archive: " + archiveName);
+    fileOutput = new FileOutputStream(
+        new File(outputDir + File.separator + archiveName));
+    bufOutput = new BufferedOutputStream(fileOutput);
+    gzipOutput = new GzipCompressorOutputStream(bufOutput);
+    tarOutput = new TarArchiveOutputStream(gzipOutput);
+    tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
+  }
+
+  /**
+   * Writes the CBOR "Self-Describe Tag" (value 55799, serialized as 3-byte
+   * sequence of {@code 0xd9d9f7}) at the current position. This method must
+   * be used to write the CBOR magic number at the beginning of the document.
+   * Since version 2.5, <a
+   * href="https://github.com/FasterXML/jackson-dataformat-cbor"
+   * >jackson-dataformat-cbor</a> will support the {@code WRITE_TYPE_HEADER}
+   * feature to write that type tag at the beginning of the document.
+   *
+   * @param generator {@link CBORGenerator} object used to create a CBOR-encoded document.
+   * @throws IOException if any I/O error occurs.
+   * @see <a href="https://tools.ietf.org/html/rfc7049#section-2.4.5">RFC
+   * 7049</a>
+   */
+  private void writeMagicHeader(CBORGenerator generator) throws IOException {
+    // Writes self-describe CBOR
+    // https://tools.ietf.org/html/rfc7049#section-2.4.5
+    // It will be supported in jackson-cbor since 2.5
+    byte[] header = new byte[3];
+    header[0] = (byte) 0xd9;
+    header[1] = (byte) 0xd9;
+    header[2] = (byte) 0xf7;
+    generator.writeBytes(header, 0, header.length);
+  }
+
+  private byte[] serializeCBORData(String jsonData) {
+    CBORFactory factory = new CBORFactory();
+
+    CBORGenerator generator = null;
+    ByteArrayOutputStream stream = null;
+
+    try {
+      stream = new ByteArrayOutputStream();
+      generator = factory.createGenerator(stream);
+      // Writes CBOR tag
+      writeMagicHeader(generator);
+      generator.writeString(jsonData);
+      generator.flush();
+      stream.flush();
+
+      return stream.toByteArray();
+
+    } catch (Exception e) {
+      LOG.warn("CBOR encoding failed: " + e.getMessage());
+    } finally {
+      try {
+        generator.close();
+        stream.close();
+      } catch (IOException e) {
+        // nothing to do
+      }
+    }
+
+    return null;
+  }
+
+  private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
+    typeCounts.put(mimeType,
+        typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1);
+  }
+
+  /**
+   * Gets the current date if the given timestamp is empty or null.
+   *
+   * @param timestamp the timestamp
+   * @return the current timestamp if the given one is null.
+   */
+  private String getDate(String timestamp) {
+    if (timestamp == null || timestamp.isEmpty()) {
+      DateFormat dateFormat = new SimpleDateFormat(
+          "EEE, d MMM yyyy HH:mm:ss z");
+      timestamp = dateFormat.format(new Date());
+    }
+    return timestamp;
+
+  }
+
+  public static String reverseUrl(String urlString) {
+    URL url;
+    String reverseKey = null;
+    try {
+      url = new URL(urlString);
+
+      String[] hostPart = url.getHost().replace('.', '/').split("/");
+
+      StringBuilder sb = new StringBuilder();
+      sb.append(hostPart[hostPart.length - 1]);
+      for (int i = hostPart.length - 2; i >= 0; i--) {
+        sb.append("/" + hostPart[i]);
+      }
+
+      reverseKey = sb.toString();
+
+    } catch (MalformedURLException e) {
+      LOG.error("Failed to parse URL: {}", urlString);
+    }
+
+    return reverseKey;
+  }
+
  /**
   * Command-line entry point: parses the options, creates the output
   * directory if missing, builds a {@link CommonCrawlConfig} and delegates
   * the actual export to {@link #dump}.
   *
   * @param args command-line arguments; {@code -outputDir} and
   *        {@code -segment} are mandatory, everything else is optional
   *        (see the printed help)
   * @return 0 on success or when help is printed, -1 on failure
   * @throws Exception declared by the Tool-style contract; in practice
   *         errors are caught, logged and mapped to the -1 return code
   */
  @Override
  public int run(String[] args) throws Exception {
    Option helpOpt = new Option("h", "help", false, "show this help message.");
    // argument options
    @SuppressWarnings("static-access")
    Option outputOpt = OptionBuilder.withArgName("outputDir").hasArg()
        .withDescription(
            "output directory (which will be created) to host the CBOR data.")
        .create("outputDir");
    // WARC format
    Option warcOpt = new Option("warc", "export to a WARC file");

    @SuppressWarnings("static-access")
    Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
        .withDescription("the segment or directory containing segments to use").create("segment");
    // create mimetype and gzip options
    @SuppressWarnings("static-access")
    Option mimeOpt = OptionBuilder.isRequired(false).withArgName("mimetype")
        .hasArgs().withDescription(
            "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
        .create("mimetype");
    @SuppressWarnings("static-access")
    Option gzipOpt = OptionBuilder.withArgName("gzip").hasArg(false)
        .withDescription(
            "an optional flag indicating whether to additionally gzip the data.")
        .create("gzip");
    @SuppressWarnings("static-access")
    Option keyPrefixOpt = OptionBuilder.withArgName("keyPrefix").hasArg(true)
        .withDescription("an optional prefix for key in the output format.")
        .create("keyPrefix");
    @SuppressWarnings("static-access")
    Option simpleDateFormatOpt = OptionBuilder.withArgName("SimpleDateFormat")
        .hasArg(false).withDescription(
            "an optional format for timestamp in GMT epoch milliseconds.")
        .create("SimpleDateFormat");
    @SuppressWarnings("static-access")
    Option epochFilenameOpt = OptionBuilder.withArgName("epochFilename")
        .hasArg(false)
        .withDescription("an optional format for output filename.")
        .create("epochFilename");
    @SuppressWarnings("static-access")
    Option jsonArrayOpt = OptionBuilder.withArgName("jsonArray").hasArg(false)
        .withDescription("an optional format for JSON output.")
        .create("jsonArray");
    @SuppressWarnings("static-access")
    Option reverseKeyOpt = OptionBuilder.withArgName("reverseKey").hasArg(false)
        .withDescription("an optional format for key value in JSON output.")
        .create("reverseKey");
    @SuppressWarnings("static-access")
    Option extensionOpt = OptionBuilder.withArgName("extension").hasArg(true)
        .withDescription("an optional file extension for output documents.")
        .create("extension");
    @SuppressWarnings("static-access")
    Option sizeOpt = OptionBuilder.withArgName("warcSize").hasArg(true)
        .withType(Number.class)
        .withDescription("an optional file size in bytes for the WARC file(s)")
        .create("warcSize");
    @SuppressWarnings("static-access")
    Option linkDbOpt = OptionBuilder.withArgName("linkdb").hasArg(true)
        .withDescription("an optional linkdb parameter to include inlinks in dump files")
        .isRequired(false)
        .create("linkdb");

    // create the options
    Options options = new Options();
    options.addOption(helpOpt);
    options.addOption(outputOpt);
    options.addOption(segOpt);
    // create mimetypes and gzip options
    options.addOption(warcOpt);
    options.addOption(mimeOpt);
    options.addOption(gzipOpt);
    // create keyPrefix option
    options.addOption(keyPrefixOpt);
    // create simpleDataFormat option
    options.addOption(simpleDateFormatOpt);
    options.addOption(epochFilenameOpt);
    options.addOption(jsonArrayOpt);
    options.addOption(reverseKeyOpt);
    options.addOption(extensionOpt);
    options.addOption(sizeOpt);
    options.addOption(linkDbOpt);

    CommandLineParser parser = new GnuParser();
    try {
      CommandLine line = parser.parse(options, args);
      // Both -outputDir and -segment are mandatory; print usage otherwise.
      if (line.hasOption("help") || !line.hasOption("outputDir") || (!line
          .hasOption("segment"))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter
            .printHelp(CommonCrawlDataDumper.class.getName(), options, true);
        return 0;
      }

      File outputDir = new File(line.getOptionValue("outputDir"));
      File segmentRootDir = new File(line.getOptionValue("segment"));
      String[] mimeTypes = line.getOptionValues("mimetype");
      boolean gzip = line.hasOption("gzip");
      boolean epochFilename = line.hasOption("epochFilename");

      String keyPrefix = line.getOptionValue("keyPrefix", "");
      boolean simpleDateFormat = line.hasOption("SimpleDateFormat");
      boolean jsonArray = line.hasOption("jsonArray");
      boolean reverseKey = line.hasOption("reverseKey");
      String extension = line.getOptionValue("extension", "");
      boolean warc = line.hasOption("warc");
      long warcSize = 0;

      // -warcSize is declared with Number.class, so the parsed value is a Long.
      if (line.getParsedOptionValue("warcSize") != null) {
        warcSize = (Long) line.getParsedOptionValue("warcSize");
      }
      String linkdbPath = line.getOptionValue("linkdb");
      File linkdb = linkdbPath == null ? null : new File(linkdbPath);

      // Bundle all output-shaping flags into a single config object.
      CommonCrawlConfig config = new CommonCrawlConfig();
      config.setKeyPrefix(keyPrefix);
      config.setSimpleDateFormat(simpleDateFormat);
      config.setJsonArray(jsonArray);
      config.setReverseKey(reverseKey);
      config.setCompressed(gzip);
      config.setWarcSize(warcSize);
      config.setOutputDir(line.getOptionValue("outputDir"));

      if (!outputDir.exists()) {
        LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
            + "]: does not exist, creating it.");
        if (!outputDir.mkdirs())
          throw new Exception(
              "Unable to create: [" + outputDir.getAbsolutePath() + "]");
      }

      CommonCrawlDataDumper dumper = new CommonCrawlDataDumper(config);

      dumper.dump(outputDir, segmentRootDir, linkdb, gzip, mimeTypes, epochFilename,
          extension, warc);

    } catch (Exception e) {
      LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils
          .stringifyException(e));
      e.printStackTrace();
      return -1;
    }

    return 0;
  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormat.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormat.java b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormat.java
new file mode 100644
index 0000000..0834d95
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormat.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.List;
+
/**
 * Interface for all CommonCrawl formatters. It provides the signature for the
 * methods used to get JSON data.
 *
 * @author gtotaro
 *
 */
public interface CommonCrawlFormat extends Closeable {

  /**
   * Returns the JSON representation of the document this formatter was
   * constructed with.
   *
   * @return the JSON data
   * @throws IOException if any I/O error occurs while generating the JSON
   */
  public String getJsonData() throws IOException;

  /**
   * Returns a string representation of the JSON structure of the URL content.
   *
   * @param url the URL of the document
   * @param content the raw content of the document
   * @param metadata the metadata associated with the document
   * @return the JSON data
   * @throws IOException if any I/O error occurs while generating the JSON
   */
  public String getJsonData(String url, Content content, Metadata metadata)
      throws IOException;

  /**
   * Returns a string representation of the JSON structure of the URL content;
   * takes into account the parsed metadata about the URL.
   *
   * @param url the URL of the document
   * @param content the raw content of the document
   * @param metadata the metadata associated with the document
   * @param parseData the parse data of the document
   * @return the JSON data
   * @throws IOException if any I/O error occurs while generating the JSON
   */
  public String getJsonData(String url, Content content, Metadata metadata,
      ParseData parseData) throws IOException;


  /**
   * sets inlinks of this document
   * @param inLinks list of inlinks
   */
  void setInLinks(List<String> inLinks);


  /**
   * gets set of inlinks
   * @return gets inlinks of this document
   */
  List<String> getInLinks();

  /**
   * Optional method that could be implemented if the actual format needs some
   * close procedure. Note that this narrows {@link Closeable#close()}:
   * implementations may not throw a checked {@code IOException}.
   */
  public abstract void close();
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
new file mode 100644
index 0000000..8814168
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Factory class that creates new {@see CommonCrawlFormat} objects (a.k.a. formatter) that map crawled files to CommonCrawl format.   
+ *
+ */
+public class CommonCrawlFormatFactory {
+	
+	/**
+	 * Returns a new instance of a {@see CommonCrawlFormat} object specifying the type of formatter. 
+	 * @param formatType the type of formatter to be created.
+	 * @param url the url.
+	 * @param content the content.
+	 * @param metadata the metadata.
+	 * @param nutchConf the configuration.
+	 * @param config the CommonCrawl output configuration.
+	 * @return the new {@see CommonCrawlFormat} object.
+	 * @throws IOException If any I/O error occurs.
+	 * @deprecated
+	 */
+	public static CommonCrawlFormat getCommonCrawlFormat(String formatType, String url, Content content,	Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
+		if (formatType == null) {
+			return null;
+		}
+		
+		if (formatType.equalsIgnoreCase("jackson")) {
+			return new CommonCrawlFormatJackson(url, content, metadata, nutchConf, config);
+		}
+		else if (formatType.equalsIgnoreCase("jettinson")) {
+			return new CommonCrawlFormatJettinson(url, content, metadata, nutchConf, config);
+		}
+		else if (formatType.equalsIgnoreCase("simple")) {
+			return new CommonCrawlFormatSimple(url, content, metadata, nutchConf, config);
+		}
+		
+		return null;
+	}
+
+	// The format should not depend on variable attributes, essentially this
+	// should be one for the full job
+	public static CommonCrawlFormat getCommonCrawlFormat(String formatType, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
+		if (formatType.equalsIgnoreCase("WARC")) {
+			return new CommonCrawlFormatWARC(nutchConf, config);
+		}
+
+		if (formatType.equalsIgnoreCase("JACKSON")) {
+			return new CommonCrawlFormatJackson( nutchConf, config);
+		}
+		return null;
+	}
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
new file mode 100644
index 0000000..0d6cae2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * This class provides methods to map crawled data on JSON using Jackson Streaming APIs. 
+ *
+ */
+public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
+	
+	private ByteArrayOutputStream out;
+	
+	private JsonGenerator generator;
+
+	public CommonCrawlFormatJackson(Configuration nutchConf,
+			CommonCrawlConfig config) throws IOException {
+		super(null, null, null, nutchConf, config);
+
+		JsonFactory factory = new JsonFactory();
+		this.out = new ByteArrayOutputStream();
+		this.generator = factory.createGenerator(out);
+
+		this.generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+	}
+	
+	public CommonCrawlFormatJackson(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
+		super(url, content, metadata, nutchConf, config);
+		
+		JsonFactory factory = new JsonFactory();
+		this.out = new ByteArrayOutputStream();
+		this.generator = factory.createGenerator(out);
+		
+		this.generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+	}
+	
+	@Override
+	protected void writeKeyValue(String key, String value) throws IOException {
+		generator.writeFieldName(key);
+		generator.writeString(value);
+	}
+	
+	@Override
+	protected void writeKeyNull(String key) throws IOException {
+		generator.writeFieldName(key);
+		generator.writeNull();
+	}
+	
+	@Override
+	protected void startArray(String key, boolean nested, boolean newline) throws IOException {
+		if (key != null) {
+			generator.writeFieldName(key);
+		}
+		generator.writeStartArray();
+	}
+	
+	@Override
+	protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
+		generator.writeEndArray();
+	}
+	
+	@Override
+	protected void writeArrayValue(String value) throws IOException {
+		generator.writeString(value);
+	}
+	
+	@Override
+	protected void startObject(String key) throws IOException {
+		if (key != null) {
+			generator.writeFieldName(key);
+		}
+		generator.writeStartObject();
+	}
+	
+	@Override
+	protected void closeObject(String key) throws IOException {
+		generator.writeEndObject();
+	}
+	
+	@Override
+	protected String generateJson() throws IOException {
+		this.generator.flush();
+		return this.out.toString();
+	}
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
new file mode 100644
index 0000000..6950e2a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.protocol.Content;
+import org.codehaus.jettison.json.JSONArray;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+
/**
 * This class provides methods to map crawled data on JSON using the Jettison
 * APIs (the "Jettinson" spelling in the class name is historical).
 */
public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
	
	// Objects currently under construction; the bottom element is the root
	// returned by generateJson().
	private Deque<JSONObject> stackObjects;
	
	// Arrays currently under construction; attached to their parent when
	// closed.
	private Deque<JSONArray> stackArrays;

	public CommonCrawlFormatJettinson(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
		super(url, content, metadata, nutchConf, config);
		
		stackObjects = new ArrayDeque<JSONObject>();
		stackArrays = new ArrayDeque<JSONArray>();
	}
	
	// Puts "key": "value" on the innermost open object.
	@Override
	protected void writeKeyValue(String key, String value) throws IOException {
		try {
			stackObjects.getFirst().put(key, value);
		} catch (JSONException jsone) {
			throw new IOException(jsone.getMessage());
		}
	}
	
	// Puts "key": null on the innermost open object.
	@Override
	protected void writeKeyNull(String key) throws IOException {
		try {
			stackObjects.getFirst().put(key, JSONObject.NULL);
		} catch (JSONException jsone) {
			throw new IOException(jsone.getMessage());
		}
	}
	
	// Pushes a new array; it is attached to its parent (and named by 'key')
	// only when closeArray() is called.
	@Override
	protected void startArray(String key, boolean nested, boolean newline) throws IOException {
		JSONArray array = new JSONArray();
		stackArrays.push(array);
	}
	
	@Override
	protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
		try {
			// NOTE(review): when only one array is on the stack, it is left
			// there and never attached anywhere — presumably the document
			// root is always an object so this case is unreachable; confirm
			// against AbstractCommonCrawlFormat's call sequence.
			if (stackArrays.size() > 1) {
				JSONArray array = stackArrays.pop();
				if (nested) {
					// Parent is an array: append positionally.
					stackArrays.getFirst().put(array);
				}
				else {
					// Parent is an object: attach under 'key'.
					stackObjects.getFirst().put(key, array);
				}
			}
		} catch (JSONException jsone) {
			throw new IOException(jsone.getMessage());
		}
	}
	
	// Appends a value to the innermost open array.
	// NOTE(review): with exactly one open array (size() == 1) the value is
	// silently dropped, mirroring the guard in closeArray() — verify this is
	// intentional.
	@Override
	protected void writeArrayValue(String value) throws IOException {
		if (stackArrays.size() > 1) {
			stackArrays.getFirst().put(value);
		}
	}
	
	// Pushes a new object; it is attached to its parent (under 'key') only
	// when closeObject() is called.
	@Override
	protected void startObject(String key) throws IOException {
		JSONObject object = new JSONObject();
		stackObjects.push(object);
	}
	
	@Override
	protected void closeObject(String key) throws IOException {
		try {
			// The bottom object is the document root and stays on the stack
			// so generateJson() can serialize it.
			if (stackObjects.size() > 1) {
				JSONObject object = stackObjects.pop();
				stackObjects.getFirst().put(key, object);
			}
		} catch (JSONException jsone) {
			throw new IOException(jsone.getMessage());
		}
	}
	
	// Serializes the root object with an indentation of 2 spaces.
	@Override
	protected String generateJson() throws IOException {
		try {
			return stackObjects.getFirst().toString(2);
		} catch (JSONException jsone) {
			throw new IOException(jsone.getMessage());
		}
	}
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
new file mode 100644
index 0000000..a1aaa44
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * This class provides methods to map crawled data on JSON using a {@see StringBuilder} object. 
+ *
+ */
+public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {
+	
+	private StringBuilder sb;
+	
+	private int tabCount;
+	
+	public CommonCrawlFormatSimple(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
+		super(url, content, metadata, nutchConf, config);
+		
+		this.sb = new StringBuilder();
+		this.tabCount = 0;
+	}
+	
+	@Override
+	protected void writeKeyValue(String key, String value) throws IOException {
+		sb.append(printTabs() + "\"" + key + "\": " + quote(value) + ",\n");
+	}
+	
+	@Override
+	protected void writeKeyNull(String key) throws IOException {
+		sb.append(printTabs() + "\"" + key + "\": null,\n");
+	}
+	
+	@Override
+	protected void startArray(String key, boolean nested, boolean newline) throws IOException {
+		String name = (key != null) ? "\"" + key + "\": " : "";
+		String nl = (newline) ? "\n" : "";
+		sb.append(printTabs() + name + "[" + nl);
+		if (newline) {
+			this.tabCount++;
+		}
+	}
+	
+	@Override
+	protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
+		if (sb.charAt(sb.length()-1) == ',') {
+			sb.deleteCharAt(sb.length()-1); // delete comma
+		}
+		else if (sb.charAt(sb.length()-2) == ',') {
+			sb.deleteCharAt(sb.length()-2); // delete comma
+		}
+		String nl = (newline) ? printTabs() : "";
+		if (newline) {
+			this.tabCount++;
+		}
+		sb.append(nl + "],\n");
+	}
+	
+	@Override
+	protected void writeArrayValue(String value) {
+		sb.append("\"" + value + "\",");
+	}
+	
+	protected void startObject(String key) throws IOException {
+		String name = "";
+		if (key != null) {
+			name = "\"" + key + "\": ";
+		}
+		sb.append(printTabs() + name + "{\n");
+		this.tabCount++;
+	}
+	
+	protected void closeObject(String key) throws IOException {
+		if (sb.charAt(sb.length()-2) == ',') {
+			sb.deleteCharAt(sb.length()-2); // delete comma
+		}
+		this.tabCount--;
+		sb.append(printTabs() + "},\n");
+	}
+	
+	protected String generateJson() throws IOException {
+		sb.deleteCharAt(sb.length()-1); // delete new line
+		sb.deleteCharAt(sb.length()-1); // delete comma
+		return sb.toString();
+	}
+	
+	private String printTabs() {
+		StringBuilder sb = new StringBuilder();
+		for (int i=0; i < this.tabCount ;i++) {
+			sb.append("\t");
+		}
+		return sb.toString();
+	}
+	
+    private static String quote(String string) throws IOException {
+    	StringBuilder sb = new StringBuilder();
+    	
+        if (string == null || string.length() == 0) {
+            sb.append("\"\"");
+            return sb.toString();
+        }
+
+        char b;
+        char c = 0;
+        String hhhh;
+        int i;
+        int len = string.length();
+
+        sb.append('"');
+        for (i = 0; i < len; i += 1) {
+            b = c;
+            c = string.charAt(i);
+            switch (c) {
+            case '\\':
+            case '"':
+                sb.append('\\');
+                sb.append(c);
+                break;
+            case '/':
+                if (b == '<') {
+                	sb.append('\\');
+                }
+                sb.append(c);
+                break;
+            case '\b':
+            	sb.append("\\b");
+                break;
+            case '\t':
+            	sb.append("\\t");
+                break;
+            case '\n':
+            	sb.append("\\n");
+                break;
+            case '\f':
+            	sb.append("\\f");
+                break;
+            case '\r':
+            	sb.append("\\r");
+                break;
+            default:
+                if (c < ' ' || (c >= '\u0080' && c < '\u00a0')
+                        || (c >= '\u2000' && c < '\u2100')) {
+                	sb.append("\\u");
+                    hhhh = Integer.toHexString(c);
+                    sb.append("0000", 0, 4 - hhhh.length());
+                    sb.append(hhhh);
+                } else {
+                	sb.append(c);
+                }
+            }
+        }
+        sb.append('"');
+        return sb.toString();
+    }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
new file mode 100644
index 0000000..191e42e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
@@ -0,0 +1,286 @@
+package org.apache.nutch.tools;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.text.ParseException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Date;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.ibm.icu.text.SimpleDateFormat;
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.ParseData;
+
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.protocol.Content;
+import org.archive.format.warc.WARCConstants;
+import org.archive.io.WriterPoolMember;
+import org.archive.io.warc.WARCRecordInfo;
+import org.archive.io.warc.WARCWriter;
+import org.archive.io.warc.WARCWriterPoolSettingsData;
+import org.archive.uid.UUIDGenerator;
+import org.archive.util.DateUtils;
+import org.archive.util.anvl.ANVLRecord;
+
+/**
+ * Exports crawl data as WARC (Web ARChive) records instead of JSON.
+ * Although it extends AbstractCommonCrawlFormat, all of the streaming
+ * JSON callbacks are unsupported here; records are written through a
+ * {@link WARCWriter} as a side effect of {@link #getJsonData()}.
+ */
+public class CommonCrawlFormatWARC extends AbstractCommonCrawlFormat {
+
+  /** Configuration key naming the maximum WARC file size before rotation. */
+  public static final String MAX_WARC_FILE_SIZE = "warc.file.size.max";
+  /** File-name template used by the WARC writer pool for output files. */
+  public static final String TEMPLATE = "${prefix}-${timestamp17}-${serialno}";
+
+  // Serial number shared across writers so rotated files get unique names.
+  private static final AtomicInteger SERIALNO = new AtomicInteger();
+  // Produces the unique WARC-Record-ID URI for every record written.
+  private final static UUIDGenerator GENERATOR = new UUIDGenerator();
+
+  private String outputDir = null;
+  // NOTE(review): 'out' is allocated in both constructors but never read
+  // or written afterwards in this class — candidate for removal.
+  private ByteArrayOutputStream out;
+  private WARCWriter writer;
+  private ParseData parseData;
+
+  /**
+   * Creates a WARC exporter with no initial document; callers supply the
+   * document later via {@link #getJsonData(String, Content, Metadata, ParseData)}.
+   *
+   * @param nutchConf Nutch/Hadoop configuration, used to build the warcinfo record
+   * @param config exporter settings; must carry a non-null output directory
+   * @throws IOException if the underlying WARC writer cannot be created
+   * @throws RuntimeException if no output directory is configured
+   */
+  public CommonCrawlFormatWARC(Configuration nutchConf,
+      CommonCrawlConfig config) throws IOException {
+    super(null, null, null, nutchConf, config);
+
+    this.out = new ByteArrayOutputStream();
+
+    // warcinfo metadata emitted at the head of every WARC file produced.
+    ANVLRecord info = WARCUtils.getWARCInfoContent(nutchConf);
+    List<String> md = Collections.singletonList(info.toString());
+
+    this.outputDir = config.getOutputDir();
+
+    if (null == outputDir) {
+      // NOTE(review): outputDir is always null here, so the interpolation
+      // adds nothing — the message could simply state the option is missing.
+      String message = "Missing output directory configuration: " + outputDir;
+
+      throw new RuntimeException(message);
+    }
+
+    File file = new File(outputDir);
+
+    // Fall back to the library default unless a positive size is configured.
+    long maxSize = WARCConstants.DEFAULT_MAX_WARC_FILE_SIZE;
+
+    if (config.getWarcSize() > 0) {
+      maxSize = config.getWarcSize();
+    }
+
+    WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
+        WriterPoolMember.DEFAULT_PREFIX, TEMPLATE, maxSize,
+        config.isCompressed(), Arrays.asList(new File[] { file }), md,
+        new UUIDGenerator());
+
+    writer = new WARCWriter(SERIALNO, settings);
+  }
+
+  /**
+   * Creates a WARC exporter primed with a single document.
+   * NOTE(review): this duplicates the writer setup of the other constructor
+   * line-for-line; extracting a shared private init helper would remove the
+   * duplication.
+   *
+   * @param url URL of the fetched document
+   * @param content raw fetched content
+   * @param metadata fetch metadata (headers, fetch time, request info)
+   * @param nutchConf Nutch/Hadoop configuration
+   * @param config exporter settings; must carry a non-null output directory
+   * @param parseData parse data for the document (stored, not used here)
+   * @throws IOException if the underlying WARC writer cannot be created
+   */
+  public CommonCrawlFormatWARC(String url, Content content, Metadata metadata,
+      Configuration nutchConf, CommonCrawlConfig config, ParseData parseData)
+      throws IOException {
+    super(url, content, metadata, nutchConf, config);
+
+    this.out = new ByteArrayOutputStream();
+    this.parseData = parseData;
+
+    ANVLRecord info = WARCUtils.getWARCInfoContent(conf);
+    List<String> md = Collections.singletonList(info.toString());
+
+    this.outputDir = config.getOutputDir();
+
+    if (null == outputDir) {
+      String message = "Missing output directory configuration: " + outputDir;
+
+      throw new RuntimeException(message);
+    }
+
+    File file = new File(outputDir);
+
+    long maxSize = WARCConstants.DEFAULT_MAX_WARC_FILE_SIZE;
+
+    if (config.getWarcSize() > 0) {
+      maxSize = config.getWarcSize();
+    }
+
+    WARCWriterPoolSettingsData settings = new WARCWriterPoolSettingsData(
+        WriterPoolMember.DEFAULT_PREFIX, TEMPLATE, maxSize,
+        config.isCompressed(), Arrays.asList(new File[] { file }), md,
+        new UUIDGenerator());
+
+    writer = new WARCWriter(SERIALNO, settings);
+  }
+
+  /**
+   * Replaces the current document and delegates to {@link #getJsonData()}.
+   * Always returns null; the WARC records are written as a side effect.
+   */
+  public String getJsonData(String url, Content content, Metadata metadata,
+      ParseData parseData) throws IOException {
+    this.url = url;
+    this.content = content;
+    this.metadata = metadata;
+    this.parseData = parseData;
+
+    return this.getJsonData();
+  }
+
+  /**
+   * Writes the current document to the WARC file: a response (or resource)
+   * record, plus a request record when request info was captured.
+   * Rotates to a new file first if the size limit was exceeded.
+   * Always returns null despite the name — see class comment.
+   */
+  @Override
+  public String getJsonData() throws IOException {
+
+    long position = writer.getPosition();
+
+    try {
+      // See if we need to open a new file because we've exceeded maxBytes
+
+      // checkSize will open a new file if we exceeded the maxBytes setting
+      writer.checkSize();
+
+      if (writer.getPosition() != position) {
+        // We just closed the file because it was larger than maxBytes.
+        position = writer.getPosition();
+      }
+
+      // response record
+      URI id = writeResponse();
+
+      if (StringUtils.isNotBlank(metadata.get("_request_"))) {
+        // write the request method if any request info is found
+        writeRequest(id);
+      }
+    } catch (IOException e) {
+      // Launch the corresponding IO error
+      throw e;
+    } catch (ParseException e) {
+      // do nothing, as we can't establish a valid WARC-Date for this record
+      // lets skip it altogether
+      LOG.error("Can't get a valid date from: {}", url);
+    }
+
+    return null;
+  }
+
+  /**
+   * Builds and writes the WARC response record for the current document.
+   * If no HTTP headers were captured the record is downgraded to a
+   * "resource" record carrying only the content.
+   *
+   * @return the WARC-Record-ID of the built record. NOTE(review): the id is
+   *         returned even when a zero-length payload suppressed the actual
+   *         write, so a request record may reference an unwritten response —
+   *         confirm whether that is intended.
+   */
+  protected URI writeResponse() throws IOException, ParseException {
+    WARCRecordInfo record = new WARCRecordInfo();
+
+    record.setType(WARCConstants.WARCRecordType.response);
+    record.setUrl(getUrl());
+
+    // NOTE(review): unused local — candidate for removal.
+    String fetchTime;
+
+    record.setCreate14DigitDate(DateUtils
+        .getLog14Date(Long.parseLong(metadata.get("nutch.fetch.time"))));
+    record.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
+    record.setRecordId(GENERATOR.getRecordID());
+
+    String IP = getResponseAddress();
+
+    if (StringUtils.isNotBlank(IP))
+      record.addExtraHeader(WARCConstants.HEADER_KEY_IP, IP);
+
+    // Mark records whose content was cut off during fetching.
+    if (ParseSegment.isTruncated(content))
+      record.addExtraHeader(WARCConstants.HEADER_KEY_TRUNCATED, "unspecified");
+
+    ByteArrayOutputStream output = new ByteArrayOutputStream();
+
+    String httpHeaders = metadata.get("_response.headers_");
+
+    if (StringUtils.isNotBlank(httpHeaders)) {
+      output.write(httpHeaders.getBytes());
+    } else {
+      // change the record type to resource as we not have information about
+      // the headers
+      record.setType(WARCConstants.WARCRecordType.resource);
+      record.setMimetype(content.getContentType());
+    }
+
+    output.write(getResponseContent().getBytes());
+
+    record.setContentLength(output.size());
+    record.setContentStream(new ByteArrayInputStream(output.toByteArray()));
+
+    if (output.size() > 0) {
+      // avoid generating a 0 sized record, as the webarchive library will
+      // complain about it
+      writer.writeRecord(record);
+    }
+
+    return record.getRecordId();
+  }
+
+  /**
+   * Builds and writes the WARC request record for the current document,
+   * linking it to the response via WARC-Concurrent-To when an id is given.
+   *
+   * @param id record id of the corresponding response record, may be null
+   * @return the WARC-Record-ID of the request record
+   */
+  protected URI writeRequest(URI id) throws IOException, ParseException {
+    WARCRecordInfo record = new WARCRecordInfo();
+
+    record.setType(WARCConstants.WARCRecordType.request);
+    record.setUrl(getUrl());
+    record.setCreate14DigitDate(DateUtils
+        .getLog14Date(Long.parseLong(metadata.get("nutch.fetch.time"))));
+    record.setMimetype(WARCConstants.HTTP_REQUEST_MIMETYPE);
+    record.setRecordId(GENERATOR.getRecordID());
+
+    if (id != null) {
+      ANVLRecord headers = new ANVLRecord();
+      headers.addLabelValue(WARCConstants.HEADER_KEY_CONCURRENT_TO,
+          '<' + id.toString() + '>');
+      record.setExtraHeaders(headers);
+    }
+
+    ByteArrayOutputStream output = new ByteArrayOutputStream();
+
+    output.write(metadata.get("_request_").getBytes());
+    record.setContentLength(output.size());
+    record.setContentStream(new ByteArrayInputStream(output.toByteArray()));
+
+    writer.writeRecord(record);
+
+    return record.getRecordId();
+  }
+
+  /** Not used for WARC output; always returns null. */
+  @Override
+  protected String generateJson() throws IOException {
+    return null;
+  }
+
+  // The streaming JSON callbacks below are meaningless for WARC output and
+  // are therefore all unsupported.
+  @Override
+  protected void writeKeyValue(String key, String value) throws IOException {
+    throw new NotImplementedException();
+  }
+
+  @Override
+  protected void writeKeyNull(String key) throws IOException {
+    throw new NotImplementedException();
+  }
+
+  @Override
+  protected void startArray(String key, boolean nested, boolean newline)
+      throws IOException {
+    throw new NotImplementedException();
+  }
+
+  @Override
+  protected void closeArray(String key, boolean nested, boolean newline)
+      throws IOException {
+    throw new NotImplementedException();
+  }
+
+  @Override
+  protected void writeArrayValue(String value) throws IOException {
+    throw new NotImplementedException();
+  }
+
+  @Override
+  protected void startObject(String key) throws IOException {
+    throw new NotImplementedException();
+  }
+
+  @Override
+  protected void closeObject(String key) throws IOException {
+    throw new NotImplementedException();
+  }
+
+  /**
+   * Closes the underlying WARC writer, wrapping any IOException in a
+   * RuntimeException since this override cannot throw a checked exception.
+   */
+  @Override
+  public void close() {
+    if (writer != null)
+      try {
+        writer.close();
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+  }
+}


[35/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/DmozParser.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/DmozParser.java b/nutch-core/src/main/java/org/apache/nutch/tools/DmozParser.java
new file mode 100644
index 0000000..54ec543
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/DmozParser.java
@@ -0,0 +1,391 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.*;
+import java.util.*;
+import java.util.regex.*;
+
+import javax.xml.parsers.*;
+import org.xml.sax.*;
+import org.xml.sax.helpers.*;
+import org.apache.xerces.util.XMLChar;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
+/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
+public class DmozParser {
+  public static final Logger LOG = LoggerFactory.getLogger(DmozParser.class);
+
+  // Running count of URLs emitted during the current parse.
+  long pages = 0;
+
+  /**
+   * This filter fixes characters that might offend our parser. This lets us be
+   * tolerant of errors that might appear in the input XML.
+   */
+  private static class XMLCharFilter extends FilterReader {
+    // True when the previous character was U+FFFD (the Unicode replacement
+    // character, 65533), which often precedes a mangled bracket.
+    private boolean lastBad = false;
+
+    public XMLCharFilter(Reader reader) {
+      super(reader);
+    }
+
+    // Single-character read: invalid XML characters are replaced with 'X';
+    // a '<' following a replacement character is neutralized unless it opens
+    // a closing tag.
+    public int read() throws IOException {
+      int c = in.read();
+      int value = c;
+      if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters
+        value = 'X';
+      else if (lastBad && c == '<') { // fix mis-matched brackets
+        in.mark(1);
+        if (in.read() != '/')
+          value = 'X';
+        in.reset();
+      }
+      lastBad = (c == 65533);
+
+      return value;
+    }
+
+    // Bulk read applying the same sanitization to each buffered character.
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      int n = in.read(cbuf, off, len);
+      if (n != -1) {
+        for (int i = 0; i < n; i++) {
+          char c = cbuf[off + i];
+          char value = c;
+          if (!(XMLChar.isValid(c))) // fix invalid characters
+            value = 'X';
+          else if (lastBad && c == '<') { // fix mis-matched brackets
+            if (i != n - 1 && cbuf[off + i + 1] != '/')
+              value = 'X';
+          }
+          lastBad = (c == 65533);
+          cbuf[off + i] = value;
+        }
+      }
+      return n;
+    }
+  }
+
+  /**
+   * The RDFProcessor receives tag messages during a parse of RDF XML data. We
+   * build whatever structures we need from these messages.
+   */
+  private class RDFProcessor extends DefaultHandler {
+    String curURL = null, curSection = null;
+    // NOTE(review): insideAdultSection is never assigned or read — candidate
+    // for removal.
+    boolean titlePending = false, descPending = false,
+        insideAdultSection = false;
+    Pattern topicPattern = null;
+    StringBuffer title = new StringBuffer(), desc = new StringBuffer();
+    XMLReader reader;
+    int subsetDenom;
+    int hashSkew;
+    boolean includeAdult;
+    Locator location;
+
+    /**
+     * Pass in an XMLReader, plus a flag as to whether we should include adult
+     * material.
+     */
+    public RDFProcessor(XMLReader reader, int subsetDenom,
+        boolean includeAdult, int skew, Pattern topicPattern)
+        throws IOException {
+      this.reader = reader;
+      this.subsetDenom = subsetDenom;
+      this.includeAdult = includeAdult;
+      this.topicPattern = topicPattern;
+
+      // A zero skew means "pick a random skew" so repeated runs sample
+      // different subsets of the file.
+      this.hashSkew = skew != 0 ? skew : new Random().nextInt();
+    }
+
+    //
+    // Interface ContentHandler
+    //
+
+    /**
+     * Start of an XML elt
+     */
+    public void startElement(String namespaceURI, String localName,
+        String qName, Attributes atts) throws SAXException {
+      if ("Topic".equals(qName)) {
+        curSection = atts.getValue("r:id");
+      } else if ("ExternalPage".equals(qName)) {
+        // Porn filter
+        if ((!includeAdult) && curSection.startsWith("Top/Adult")) {
+          return;
+        }
+
+        if (topicPattern != null && !topicPattern.matcher(curSection).matches()) {
+          return;
+        }
+
+        // Subset denominator filter.
+        // Only emit with a chance of 1/denominator.
+        String url = atts.getValue("about");
+        int hashValue = MD5Hash.digest(url).hashCode();
+        hashValue = Math.abs(hashValue ^ hashSkew);
+        if ((hashValue % subsetDenom) != 0) {
+          return;
+        }
+
+        // We actually claim the URL!
+        curURL = url;
+      } else if (curURL != null && "d:Title".equals(qName)) {
+        titlePending = true;
+      } else if (curURL != null && "d:Description".equals(qName)) {
+        descPending = true;
+      }
+    }
+
+    /**
+     * The contents of an XML elt
+     */
+    public void characters(char ch[], int start, int length) {
+      if (titlePending) {
+        title.append(ch, start, length);
+      } else if (descPending) {
+        desc.append(ch, start, length);
+      }
+    }
+
+    /**
+     * Termination of XML elt
+     */
+    public void endElement(String namespaceURI, String localName, String qName)
+        throws SAXException {
+      if (curURL != null) {
+        if ("ExternalPage".equals(qName)) {
+          //
+          // Inc the number of pages, insert the page, and
+          // possibly print status.
+          //
+          System.out.println(curURL);
+          pages++;
+
+          //
+          // Clear out the link text. This is what
+          // you would use for adding to the linkdb.
+          //
+          if (title.length() > 0) {
+            title.delete(0, title.length());
+          }
+          if (desc.length() > 0) {
+            desc.delete(0, desc.length());
+          }
+
+          // Null out the URL.
+          curURL = null;
+        } else if ("d:Title".equals(qName)) {
+          titlePending = false;
+        } else if ("d:Description".equals(qName)) {
+          descPending = false;
+        }
+      }
+    }
+
+    /**
+     * When parsing begins
+     */
+    public void startDocument() {
+      LOG.info("Begin parse");
+    }
+
+    /**
+     * When parsing ends
+     */
+    public void endDocument() {
+      LOG.info("Completed parse.  Found " + pages + " pages.");
+    }
+
+    /**
+     * From time to time the Parser will set the "current location" by calling
+     * this function. It's useful for emitting locations for error messages.
+     */
+    public void setDocumentLocator(Locator locator) {
+      location = locator;
+    }
+
+    //
+    // Interface ErrorHandler
+    //
+
+    /**
+     * Emit the exception message
+     */
+    public void error(SAXParseException spe) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("Error: " + spe.toString() + ": " + spe.getMessage());
+      }
+    }
+
+    /**
+     * Emit the exception message, with line numbers
+     * NOTE(review): this method name looks like a typo for fatalError() from
+     * org.xml.sax.ErrorHandler; as written, SAX will never invoke it and
+     * fatal parse errors fall through to the default handler — confirm and
+     * rename.
+     */
+    public void errorError(SAXParseException spe) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("Fatal err: " + spe.toString() + ": " + spe.getMessage());
+        LOG.error("Last known line is " + location.getLineNumber()
+            + ", column " + location.getColumnNumber());
+      }
+    }
+
+    /**
+     * Emit exception warning message
+     */
+    public void warning(SAXParseException spe) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Warning: " + spe.toString() + ": " + spe.getMessage());
+      }
+    }
+  }
+
+  /**
+   * Iterate through all the items in this structured DMOZ file. Add each URL to
+   * the web db.
+   * NOTE(review): on any parse failure this calls System.exit(0), which both
+   * kills the JVM from library code and reports a success status — confirm
+   * whether a nonzero exit (or rethrow) was intended.
+   */
+  public void parseDmozFile(File dmozFile, int subsetDenom,
+      boolean includeAdult, int skew, Pattern topicPattern)
+
+  throws IOException, SAXException, ParserConfigurationException {
+
+    SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+    SAXParser parser = parserFactory.newSAXParser();
+    XMLReader reader = parser.getXMLReader();
+
+    // Create our own processor to receive SAX events
+    RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew,
+        topicPattern);
+    reader.setContentHandler(rp);
+    reader.setErrorHandler(rp);
+    LOG.info("skew = " + rp.hashSkew);
+
+    //
+    // Open filtered text stream. The TextFilter makes sure that
+    // only appropriate XML-approved Text characters are received.
+    // Any non-conforming characters are silently skipped.
+    //
+    XMLCharFilter in = new XMLCharFilter(new BufferedReader(
+        new InputStreamReader(new BufferedInputStream(new FileInputStream(
+            dmozFile)), "UTF-8")));
+    try {
+      InputSource is = new InputSource(in);
+      reader.parse(is);
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.toString());
+      }
+      System.exit(0);
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Reads one topic per line from the given UTF-8 file and appends each to
+   * the supplied vector.
+   * NOTE(review): if opening the file throws, 'in' is still null and the
+   * finally block will raise an NPE; also exits the JVM with status 0 on
+   * error — confirm both are intended.
+   */
+  private static void addTopicsFromFile(String topicFile, Vector<String> topics)
+      throws IOException {
+    BufferedReader in = null;
+    try {
+      in = new BufferedReader(new InputStreamReader(new FileInputStream(
+          topicFile), "UTF-8"));
+      String line = null;
+      while ((line = in.readLine()) != null) {
+        // NOTE(review): the new String(...) copy is redundant.
+        topics.addElement(new String(line));
+      }
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.toString());
+      }
+      System.exit(0);
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Command-line access. User may add URLs via a flat text file or the
+   * structured DMOZ file. By default, we ignore Adult material (as categorized
+   * by DMOZ).
+   */
+  public static void main(String argv[]) throws Exception {
+    if (argv.length < 1) {
+      System.err
+          .println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
+      return;
+    }
+
+    //
+    // Parse the command line, figure out what kind of
+    // URL file we need to load
+    //
+    int subsetDenom = 1;
+    int skew = 0;
+    String dmozFile = argv[0];
+    boolean includeAdult = false;
+    Pattern topicPattern = null;
+    Vector<String> topics = new Vector<String>();
+
+    Configuration conf = NutchConfiguration.create();
+    FileSystem fs = FileSystem.get(conf);
+    try {
+      for (int i = 1; i < argv.length; i++) {
+        if ("-includeAdultMaterial".equals(argv[i])) {
+          includeAdult = true;
+        } else if ("-subset".equals(argv[i])) {
+          subsetDenom = Integer.parseInt(argv[i + 1]);
+          i++;
+        } else if ("-topic".equals(argv[i])) {
+          topics.addElement(argv[i + 1]);
+          i++;
+        } else if ("-topicFile".equals(argv[i])) {
+          addTopicsFromFile(argv[i + 1], topics);
+          i++;
+        } else if ("-skew".equals(argv[i])) {
+          skew = Integer.parseInt(argv[i + 1]);
+          i++;
+        }
+      }
+
+      DmozParser parser = new DmozParser();
+
+      // Build a "^(t1|t2|...).*" alternation from the requested topics.
+      // NOTE(review): topic strings are interpolated unescaped, so regex
+      // metacharacters in a topic name change the match — consider
+      // Pattern.quote() per topic.
+      if (!topics.isEmpty()) {
+        String regExp = new String("^(");
+        int j = 0;
+        for (; j < topics.size() - 1; ++j) {
+          regExp = regExp.concat(topics.get(j));
+          regExp = regExp.concat("|");
+        }
+        regExp = regExp.concat(topics.get(j));
+        regExp = regExp.concat(").*");
+        LOG.info("Topic selection pattern = " + regExp);
+        topicPattern = Pattern.compile(regExp);
+      }
+
+      parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew,
+          topicPattern);
+
+    } finally {
+      fs.close();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/FileDumper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/FileDumper.java b/nutch-core/src/main/java/org/apache/nutch/tools/FileDumper.java
new file mode 100644
index 0000000..b7c1805
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/FileDumper.java
@@ -0,0 +1,419 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+//JDK imports
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileOutputStream;
+import java.io.ByteArrayInputStream;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import com.google.common.base.Strings;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+//Commons imports
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.codec.digest.DigestUtils;
+
+//Hadoop
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DumpFileUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TableUtil;
+
+//Tika imports
+import org.apache.tika.Tika;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * <p>
+ * The file dumper tool enables one to reverse generate the raw content from
+ * Nutch segment data directories.
+ * </p>
+ * <p>
+ * The tool has a number of immediate uses:
+ * <ol>
+ * <li>one can see what a page looked like at the time it was crawled</li>
+ * <li>one can see different media types acquired as part of the crawl</li>
+ * <li>it enables us to see webpages before we augment them with additional
+ * metadata, this can be handy for providing a provenance trail for your crawl
+ * data.</li>
+ * </ol>
+ * </p>
+ * <p>
+ * Upon successful completion the tool displays a very convenient JSON snippet
+ * detailing the mimetype classifications and the counts of documents which fall
+ * into those classifications. An example is as follows:
+ * </p>
+ * 
+ * <pre>
+ * {@code
+ * INFO: File Types: 
+ *   TOTAL Stats:    
+ *    [
+ *     {"mimeType":"application/xml","count":"19"}
+ *     {"mimeType":"image/png","count":"47"}
+ *     {"mimeType":"image/jpeg","count":"141"}
+ *     {"mimeType":"image/vnd.microsoft.icon","count":"4"}
+ *     {"mimeType":"text/plain","count":"89"}
+ *     {"mimeType":"video/quicktime","count":"2"}
+ *     {"mimeType":"image/gif","count":"63"}
+ *     {"mimeType":"application/xhtml+xml","count":"1670"}
+ *     {"mimeType":"application/octet-stream","count":"40"}
+ *     {"mimeType":"text/html","count":"1863"}
+ *   ]
+ *   
+ *   FILTER Stats: 
+ *   [
+ *     {"mimeType":"image/png","count":"47"}
+ *     {"mimeType":"image/jpeg","count":"141"}
+ *     {"mimeType":"image/vnd.microsoft.icon","count":"4"}
+ *     {"mimeType":"video/quicktime","count":"2"}
+ *     {"mimeType":"image/gif","count":"63"}
+ *   ]
+ * }
+ * </pre>
+ * <p>
+ * In the case above, the tool would have been run with the <b>-mimeType
+ * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
+ * flag and corresponding values activated.
+ * 
+ */
+public class FileDumper {
+
+  private static final Logger LOG = LoggerFactory.getLogger(FileDumper.class
+      .getName());
+
+  /**
+   * Dumps the reverse engineered raw content from the provided segment
+   * directories if a parent directory contains more than one segment, otherwise
+   * a single segment can be passed as an argument.
+   * 
+   * @param outputDir
+   *          the directory you wish to dump the raw content to. This directory
+   *          will be created.
+   * @param segmentRootDir
+   *          a directory containing one or more segments.
+   * @param mimeTypes
+   *          an array of mime types we have to dump, all others will be
+   *          filtered out.
+   * @param flatDir
+   *          a boolean flag specifying whether the output directory should contain
+   *          only files instead of using nested directories to prevent naming
+   *          conflicts.
+   * @param mimeTypeStats
+   *          a flag indicating whether mimetype stats should be displayed
+   *          instead of dumping files.
+   * @throws Exception
+   */
+  public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump)
+      throws Exception {
+    if (mimeTypes == null)
+      LOG.info("Accepting all mimetypes.");
+    // total file counts
+    Map<String, Integer> typeCounts = new HashMap<String, Integer>();
+    // filtered file counts
+    Map<String, Integer> filteredCounts = new HashMap<String, Integer>();
+    Configuration conf = NutchConfiguration.create();
+    FileSystem fs = FileSystem.get(conf);
+    int fileCount = 0;
+    File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
+
+      @Override
+      public boolean accept(File file) {
+        return file.canRead() && file.isDirectory();
+      }
+    });
+    if (segmentDirs == null) {
+      LOG.error("No segment directories found in ["
+          + segmentRootDir.getAbsolutePath() + "]");
+      return;
+    }
+
+    for (File segment : segmentDirs) {
+      LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
+      DataOutputStream doutputStream = null;
+
+      File segmentDir = new File(segment.getAbsolutePath(), Content.DIR_NAME);
+      File[] partDirs = segmentDir.listFiles(new FileFilter() {
+        @Override
+        public boolean accept(File file) {
+          return file.canRead() && file.isDirectory();
+        }
+      });
+
+      if (partDirs == null) {
+        LOG.warn("Skipping Corrupt Segment: [{}]", segment.getAbsolutePath());
+        continue;
+      }
+
+      for (File partDir : partDirs) {
+        try {
+          String segmentPath = partDir + "/data";
+          Path file = new Path(segmentPath);
+          if (!new File(file.toString()).exists()) {
+            LOG.warn("Skipping segment: [" + segmentPath
+                + "]: no data directory present");
+            continue;
+          }
+
+          SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
+
+          Writable key = (Writable) reader.getKeyClass().newInstance();
+          Content content = null;
+
+          while (reader.next(key)) {
+            content = new Content();
+            reader.getCurrentValue(content);
+            String url = key.toString();
+            String baseName = FilenameUtils.getBaseName(url);
+            String extension = FilenameUtils.getExtension(url);
+            if (extension == null || (extension != null && extension.equals(""))) {
+              extension = "html";
+            }
+
+            String filename = baseName + "." + extension;
+            ByteArrayInputStream bas = null;
+            Boolean filter = false;
+            try {
+              bas = new ByteArrayInputStream(content.getContent());
+              String mimeType = new Tika().detect(content.getContent());
+              collectStats(typeCounts, mimeType);
+              if (mimeType != null) {
+                if (mimeTypes == null
+                    || Arrays.asList(mimeTypes).contains(mimeType)) {
+                  collectStats(filteredCounts, mimeType);
+                  filter = true;
+                }
+              }
+            } catch (Exception e) {
+              e.printStackTrace();
+              LOG.warn("Tika is unable to detect type for: [" + url + "]");
+            } finally {
+              if (bas != null) {
+                try {
+                  bas.close();
+                } catch (Exception ignore) {
+                }
+              }
+            }
+
+            if (filter) {
+              if (!mimeTypeStats) {
+                String md5Ofurl = DumpFileUtil.getUrlMD5(url);
+
+                String fullDir = outputDir.getAbsolutePath();
+                if (!flatDir && !reverseURLDump) {
+                  fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
+                }
+
+                if (!Strings.isNullOrEmpty(fullDir)) {
+                  String outputFullPath;
+
+                  if (reverseURLDump) {
+                    String[] reversedURL = TableUtil.reverseUrl(url).split(":");
+                    reversedURL[0] = reversedURL[0].replace('.', '/');
+
+                    String reversedURLPath = reversedURL[0] + "/" + DigestUtils.sha256Hex(url).toUpperCase();
+                    outputFullPath = String.format("%s/%s", fullDir, reversedURLPath);
+                    
+                    // We'll drop the trailing file name and create the nested structure if it doesn't already exist.
+                    String[] splitPath = outputFullPath.split("/");
+                    File fullOutputDir = new File(org.apache.commons.lang3.StringUtils.join(Arrays.copyOf(splitPath, splitPath.length - 1), "/"));
+
+                    if (!fullOutputDir.exists()) {
+                      fullOutputDir.mkdirs();
+                    }
+                  } else {
+                    outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
+                  }
+
+                  File outputFile = new File(outputFullPath);
+                  
+                  if (!outputFile.exists()) {
+                    LOG.info("Writing: [" + outputFullPath + "]");
+
+                    // Modified to prevent FileNotFoundException (Invalid Argument) 
+                    FileOutputStream output = null;
+                    try {
+                      output = new FileOutputStream(outputFile);
+                      IOUtils.write(content.getContent(), output);
+                    }
+                    catch (Exception e) {
+                      LOG.warn("Write Error: [" + outputFullPath + "]");
+                      e.printStackTrace();
+                    }
+                    finally {
+                      if (output != null) {
+                        output.flush();
+                        try {
+                          output.close();
+                        } catch (Exception ignore) {
+                        }
+                      }
+                    }
+                    fileCount++;
+                  } else {
+                    LOG.info("Skipping writing: [" + outputFullPath
+                        + "]: file already exists");
+                  }
+                }
+              }
+            }
+          }
+          reader.close();
+        } finally {
+          fs.close();
+          if (doutputStream != null) {
+            try {
+              doutputStream.close();
+            } catch (Exception ignore) {
+            }
+          }
+        }
+      }
+    }
+    LOG.info("Dumper File Stats: "
+        + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
+
+    if (mimeTypeStats) {
+      System.out.println("Dumper File Stats: " 
+          + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
+    }
+  }
+
+  /**
+   * Command-line entry point for the dumper tool.
+   *
+   * @param args
+   *          1) output directory (which will be created) to host the raw data
+   *          and 2) a directory containing one or more segments.
+   * @throws Exception
+   */
+  public static void main(String[] args) throws Exception {
+    // Flag option (takes no argument).
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    // Remaining options are built via commons-cli's static OptionBuilder,
+    // hence the @SuppressWarnings("static-access") on each use.
+    @SuppressWarnings("static-access")
+    Option outputOpt = OptionBuilder
+        .withArgName("outputDir")
+        .hasArg()
+        .withDescription(
+            "output directory (which will be created) to host the raw data")
+        .create("outputDir");
+    @SuppressWarnings("static-access")
+    Option segOpt = OptionBuilder
+        .withArgName("segment")
+        .hasArgs()
+        .withDescription("the segment(s) to use")
+        .create("segment");
+    @SuppressWarnings("static-access")
+    Option mimeOpt = OptionBuilder
+        .withArgName("mimetype")
+        .hasArgs()
+        .withDescription(
+            "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
+        .create("mimetype");
+    @SuppressWarnings("static-access")
+    Option mimeStat = OptionBuilder
+        .withArgName("mimeStats")
+        .withDescription(
+            "only display mimetype stats for the segment(s) instead of dumping file.")
+        .create("mimeStats");
+    @SuppressWarnings("static-access")
+    Option dirStructureOpt = OptionBuilder
+        .withArgName("flatdir")
+        .withDescription(
+            "optionally specify that the output directory should only contain files.")
+        .create("flatdir");
+    @SuppressWarnings("static-access")
+    Option reverseURLOutput = OptionBuilder
+        .withArgName("reverseUrlDirs")
+        .withDescription(
+            "optionally specify to use reverse URL folders for output structure.")
+        .create("reverseUrlDirs");
+
+    // Register every option in one pass.
+    Options options = new Options();
+    for (Option opt : new Option[] { helpOpt, outputOpt, segOpt, mimeOpt,
+        mimeStat, dirStructureOpt, reverseURLOutput }) {
+      options.addOption(opt);
+    }
+
+    CommandLineParser parser = new GnuParser();
+    try {
+      CommandLine cli = parser.parse(options, args);
+      // Both -outputDir and -segment are mandatory.
+      if (cli.hasOption("help") || !cli.hasOption("outputDir")
+          || !cli.hasOption("segment")) {
+        new HelpFormatter().printHelp("FileDumper", options, true);
+        return;
+      }
+
+      File outputDir = new File(cli.getOptionValue("outputDir"));
+      File segmentRootDir = new File(cli.getOptionValue("segment"));
+      String[] mimeTypes = cli.getOptionValues("mimetype");
+      boolean flatDir = cli.hasOption("flatdir");
+      boolean shouldDisplayStats = cli.hasOption("mimeStats");
+      boolean reverseURLDump = cli.hasOption("reverseUrlDirs");
+
+      if (!outputDir.exists()) {
+        LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
+        + "]: does not exist, creating it.");
+        // Stats-only mode writes nothing, so the directory is not created.
+        if (!shouldDisplayStats) {
+          if (!outputDir.mkdirs())
+            throw new Exception("Unable to create: ["
+                + outputDir.getAbsolutePath() + "]");
+        }
+      }
+
+      new FileDumper().dump(outputDir, segmentRootDir, mimeTypes, flatDir,
+          shouldDisplayStats, reverseURLDump);
+    } catch (Exception e) {
+      LOG.error("FileDumper: " + StringUtils.stringifyException(e));
+      e.printStackTrace();
+      return;
+    }
+  }
+
+  /**
+   * Increments the counter for {@code mimeType} in the given tally map,
+   * starting at 1 for a mimetype not seen before.
+   */
+  private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
+    Integer seenSoFar = typeCounts.get(mimeType);
+    typeCounts.put(mimeType, seenSoFar == null ? 1 : seenSoFar + 1);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/FreeGenerator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/FreeGenerator.java b/nutch-core/src/main/java/org/apache/nutch/tools/FreeGenerator.java
new file mode 100644
index 0000000..138372f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/FreeGenerator.java
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextInputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.URLPartitioner;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+
+/**
+ * This tool generates fetchlists (segments to be fetched) from plain text files
+ * containing one URL per line. It's useful when arbitrary URL-s need to be
+ * fetched without adding them first to the CrawlDb, or during testing.
+ */
+public class FreeGenerator extends Configured implements Tool {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(FreeGenerator.class);
+
+  // Job-configuration keys toggling URL filtering/normalization in the map
+  // phase; set from the -filter / -normalize command-line flags in run().
+  private static final String FILTER_KEY = "free.generator.filter";
+  private static final String NORMALIZE_KEY = "free.generator.normalize";
+
+  /**
+   * Combined mapper/reducer for the generation job. The map phase optionally
+   * normalizes and filters each input URL line and applies injected scoring;
+   * the reduce phase re-establishes URL uniqueness.
+   */
+  public static class FG extends MapReduceBase implements
+      Mapper<WritableComparable<?>, Text, Text, Generator.SelectorEntry>,
+      Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> {
+    private URLNormalizers normalizers = null;
+    private URLFilters filters = null;
+    private ScoringFilters scfilters;
+    // NOTE(review): datum and url are single instances reused across map()
+    // calls (common Hadoop mapred pattern); assumes output.collect()
+    // serializes them immediately — confirm before refactoring.
+    private CrawlDatum datum = new CrawlDatum();
+    private Text url = new Text();
+    private int defaultInterval = 0;
+
+    @Override
+    public void configure(JobConf job) {
+      super.configure(job);
+      defaultInterval = job.getInt("db.fetch.interval.default", 0);
+      scfilters = new ScoringFilters(job);
+      // Filtering and normalization are optional: the plugin chains are only
+      // instantiated when the corresponding flag was set in run().
+      if (job.getBoolean(FILTER_KEY, false)) {
+        filters = new URLFilters(job);
+      }
+      if (job.getBoolean(NORMALIZE_KEY, false)) {
+        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
+      }
+    }
+
+    // Reused output value wrapper (same reuse caveat as datum/url above).
+    Generator.SelectorEntry entry = new Generator.SelectorEntry();
+
+    /**
+     * Maps one input line (a URL) to a (url, SelectorEntry) pair. A URL
+     * rejected by normalization or filtering (null result), or raising any
+     * exception in a plugin, is logged and skipped — it never fails the job.
+     */
+    public void map(WritableComparable<?> key, Text value,
+        OutputCollector<Text, Generator.SelectorEntry> output, Reporter reporter)
+        throws IOException {
+      // value is a line of text
+      String urlString = value.toString();
+      try {
+        if (normalizers != null) {
+          urlString = normalizers.normalize(urlString,
+              URLNormalizers.SCOPE_INJECT);
+        }
+        if (urlString != null && filters != null) {
+          urlString = filters.filter(urlString);
+        }
+        if (urlString != null) {
+          url.set(urlString);
+          scfilters.injectedScore(url, datum);
+        }
+      } catch (Exception e) {
+        LOG.warn("Error adding url '" + value.toString() + "', skipping: "
+            + StringUtils.stringifyException(e));
+        return;
+      }
+      if (urlString == null) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("- skipping " + value.toString());
+        }
+        return;
+      }
+      entry.datum = datum;
+      entry.url = url;
+      // https://issues.apache.org/jira/browse/NUTCH-1430
+      entry.datum.setFetchInterval(defaultInterval);
+      output.collect(url, entry);
+    }
+
+    /**
+     * Emits each distinct URL once. Map output keys can collide (see the
+     * HashComparator used as the output key comparator in run()), so
+     * uniqueness is recomputed here on the URL value itself.
+     */
+    public void reduce(Text key, Iterator<Generator.SelectorEntry> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
+      // pick unique urls from values - discard the reduce key due to hash
+      // collisions
+      // NOTE(review): this stores the Text/CrawlDatum references obtained
+      // from the values iterator; assumes they are fresh (not reused)
+      // instances — verify against the SelectorEntry deserialization.
+      HashMap<Text, CrawlDatum> unique = new HashMap<Text, CrawlDatum>();
+      while (values.hasNext()) {
+        Generator.SelectorEntry entry = values.next();
+        unique.put(entry.url, entry.datum);
+      }
+      // output unique urls
+      for (Entry<Text, CrawlDatum> e : unique.entrySet()) {
+        output.collect(e.getKey(), e.getValue());
+      }
+    }
+  }
+
+  /**
+   * Parses the command line and runs the fetchlist-generation job.
+   *
+   * @param args inputDir, segmentsDir, then optional -filter / -normalize
+   * @return 0 on success, -1 on usage error or job failure
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err
+          .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
+      System.err
+          .println("\tinputDir\tinput directory containing one or more input files.");
+      System.err
+          .println("\t\tEach text file contains a list of URLs, one URL per line");
+      System.err
+          .println("\tsegmentsDir\toutput directory, where new segment will be created");
+      System.err.println("\t-filter\trun current URLFilters on input URLs");
+      System.err
+          .println("\t-normalize\trun current URLNormalizers on input URLs");
+      return -1;
+    }
+    boolean filter = false;
+    boolean normalize = false;
+    if (args.length > 2) {
+      for (int i = 2; i < args.length; i++) {
+        if (args[i].equals("-filter")) {
+          filter = true;
+        } else if (args[i].equals("-normalize")) {
+          normalize = true;
+        } else {
+          LOG.error("Unknown argument: " + args[i] + ", exiting ...");
+          return -1;
+        }
+      }
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("FreeGenerator: starting at " + sdf.format(start));
+
+    // Configure the job: text lines in, FG as both mapper and reducer, a
+    // generate-format segment directory out.
+    JobConf job = new NutchJob(getConf());
+    job.setBoolean(FILTER_KEY, filter);
+    job.setBoolean(NORMALIZE_KEY, normalize);
+    FileInputFormat.addInputPath(job, new Path(args[0]));
+    job.setInputFormat(TextInputFormat.class);
+    job.setMapperClass(FG.class);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(Generator.SelectorEntry.class);
+    job.setPartitionerClass(URLPartitioner.class);
+    job.setReducerClass(FG.class);
+    String segName = Generator.generateSegmentName();
+    // One reduce task per map task.
+    job.setNumReduceTasks(job.getNumMapTasks());
+    job.setOutputFormat(SequenceFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+    // Sort map output keys the same way Generator does.
+    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
+    FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName,
+        CrawlDatum.GENERATE_DIR_NAME)));
+    try {
+      JobClient.runJob(job);
+    } catch (Exception e) {
+      LOG.error("FAILED: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+    long end = System.currentTimeMillis();
+    LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+    return 0;
+  }
+
+  /** Command-line entry point; delegates to {@link #run(String[])}. */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(),
+        args);
+    System.exit(res);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/ResolveUrls.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/ResolveUrls.java b/nutch-core/src/main/java/org/apache/nutch/tools/ResolveUrls.java
new file mode 100644
index 0000000..2b1c63b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/ResolveUrls.java
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tools;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.net.InetAddress;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * A simple tool that will spin up multiple threads to resolve urls to ip
+ * addresses. This can be used to verify that pages that are failing due to
+ * UnknownHostException during fetching are actually bad and are not failing due
+ * to a dns problem in fetching.
+ */
+public class ResolveUrls {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ResolveUrls.class);
+
+  private String urlsFile = null;
+  private int numThreads = 100;
+  private ExecutorService pool = null;
+  // Shared counters updated by the resolver tasks.
+  private static AtomicInteger numTotal = new AtomicInteger(0);
+  private static AtomicInteger numErrored = new AtomicInteger(0);
+  private static AtomicInteger numResolved = new AtomicInteger(0);
+  private static AtomicLong totalTime = new AtomicLong(0L);
+
+  /**
+   * A task which resolves the ip address of a single host by name.
+   * (Implements Runnable rather than extending Thread: instances are
+   * executed on the pool, never started as threads themselves.)
+   */
+  private static class ResolverThread implements Runnable {
+
+    private String url = null;
+
+    public ResolverThread(String url) {
+      this.url = url;
+    }
+
+    public void run() {
+
+      numTotal.incrementAndGet();
+      String host = URLUtil.getHost(url);
+      long start = System.currentTimeMillis();
+      try {
+
+        // get the address by name and if no error is thrown then it
+        // is resolved successfully
+        InetAddress.getByName(host);
+        LOG.info("Resolved: " + host);
+        numResolved.incrementAndGet();
+      } catch (Exception uhe) {
+        LOG.info("Error Resolving: " + host);
+        numErrored.incrementAndGet();
+      }
+      long end = System.currentTimeMillis();
+      long total = (end - start);
+      totalTime.addAndGet(total);
+      LOG.info(", " + total + " millis");
+    }
+  }
+
+  /**
+   * Creates a thread pool for resolving urls. Reads in the url file on the
+   * local filesystem. For each url it attempts to resolve it keeping a total
+   * account of the number resolved, errored, and the amount of time.
+   */
+  public void resolveUrls() {
+
+    try {
+
+      // create a thread pool with a fixed number of threads
+      pool = Executors.newFixedThreadPool(numThreads);
+
+      // read in the urls file and loop through each line, one url per line;
+      // try-with-resources guarantees the reader is closed even on error
+      try (BufferedReader buffRead = new BufferedReader(new FileReader(
+          new File(urlsFile)))) {
+        String urlStr = null;
+        while ((urlStr = buffRead.readLine()) != null) {
+
+          // spin up a resolver task per url
+          LOG.info("Starting: " + urlStr);
+          pool.execute(new ResolverThread(urlStr));
+        }
+      }
+
+      // request an orderly shutdown BEFORE waiting: awaitTermination only
+      // returns early once shutdown has been initiated, otherwise it always
+      // blocks for the full timeout
+      pool.shutdown();
+      pool.awaitTermination(60, TimeUnit.SECONDS);
+    } catch (InterruptedException e) {
+
+      // on interruption shut down immediately and restore the interrupt flag
+      pool.shutdownNow();
+      Thread.currentThread().interrupt();
+      LOG.info(StringUtils.stringifyException(e));
+    } catch (Exception e) {
+
+      // on error shutdown the thread pool immediately (pool may be null if
+      // its creation itself failed)
+      if (pool != null) {
+        pool.shutdownNow();
+      }
+      LOG.info(StringUtils.stringifyException(e));
+    }
+
+    // log totals; guard against division by zero when no urls were read
+    int total = numTotal.get();
+    long avgTime = total > 0 ? totalTime.get() / total : 0L;
+    LOG.info("Total: " + total + ", Resovled: " + numResolved.get()
+        + ", Errored: " + numErrored.get() + ", Average Time: " + avgTime);
+  }
+
+  /**
+   * Create a new ResolveUrls with a file from the local file system.
+   * 
+   * @param urlsFile
+   *          The local urls file, one url per line.
+   */
+  public ResolveUrls(String urlsFile) {
+    this(urlsFile, 100);
+  }
+
+  /**
+   * Create a new ResolveUrls with a urls file and a number of threads for the
+   * Thread pool. Number of threads is 100 by default.
+   * 
+   * @param urlsFile
+   *          The local urls file, one url per line.
+   * @param numThreads
+   *          The number of threads used to resolve urls in parallel.
+   */
+  public ResolveUrls(String urlsFile, int numThreads) {
+    this.urlsFile = urlsFile;
+    this.numThreads = numThreads;
+  }
+
+  /**
+   * Runs the resolve urls tool.
+   */
+  public static void main(String[] args) {
+
+    Options options = new Options();
+    OptionBuilder.withArgName("help");
+    OptionBuilder.withDescription("show this help message");
+    Option helpOpts = OptionBuilder.create("help");
+    options.addOption(helpOpts);
+
+    OptionBuilder.withArgName("urls");
+    OptionBuilder.hasArg();
+    OptionBuilder.withDescription("the urls file to check");
+    Option urlOpts = OptionBuilder.create("urls");
+    options.addOption(urlOpts);
+
+    OptionBuilder.withArgName("numThreads");
+    OptionBuilder.hasArgs();
+    OptionBuilder.withDescription("the number of threads to use");
+    Option numThreadOpts = OptionBuilder.create("numThreads");
+    options.addOption(numThreadOpts);
+
+    CommandLineParser parser = new GnuParser();
+    try {
+      // parse out common line arguments
+      CommandLine line = parser.parse(options, args);
+      if (line.hasOption("help") || !line.hasOption("urls")) {
+        HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp("ResolveUrls", options);
+        return;
+      }
+
+      // get the urls and the number of threads and start the resolver
+      String urls = line.getOptionValue("urls");
+      int numThreads = 100;
+      String numThreadsStr = line.getOptionValue("numThreads");
+      if (numThreadsStr != null) {
+        numThreads = Integer.parseInt(numThreadsStr);
+      }
+      ResolveUrls resolve = new ResolveUrls(urls, numThreads);
+      resolve.resolveUrls();
+    } catch (Exception e) {
+      LOG.error("ResolveUrls: " + StringUtils.stringifyException(e));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/WARCUtils.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/WARCUtils.java b/nutch-core/src/main/java/org/apache/nutch/tools/WARCUtils.java
new file mode 100644
index 0000000..d8ae0b3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/WARCUtils.java
@@ -0,0 +1,154 @@
+package org.apache.nutch.tools;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.StringUtil;
+import org.archive.format.http.HttpHeaders;
+import org.archive.format.warc.WARCConstants;
+import org.archive.io.warc.WARCRecordInfo;
+import org.archive.uid.UUIDGenerator;
+import org.archive.util.DateUtils;
+import org.archive.util.anvl.ANVLRecord;
+
+public class WARCUtils {
+    public final static String SOFTWARE = "software";
+    public final static String HTTP_HEADER_FROM = "http-header-from";
+    public final static String HTTP_HEADER_USER_AGENT = "http-header-user-agent";
+    public final static String HOSTNAME = "hostname";
+    public final static String ROBOTS = "robots";
+    public final static String OPERATOR = "operator";
+    public final static String FORMAT = "format";
+    public final static String CONFORMS_TO = "conformsTo";
+    public final static String IP = "ip";
+    public final static UUIDGenerator generator = new UUIDGenerator();
+
+    public static final ANVLRecord getWARCInfoContent(Configuration conf) {
+        ANVLRecord record = new ANVLRecord();
+
+        // informative headers
+        record.addLabelValue(FORMAT, "WARC File Format 1.0");
+        record.addLabelValue(CONFORMS_TO, "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
+
+        record.addLabelValue(SOFTWARE, conf.get("http.agent.name", ""));
+        record.addLabelValue(HTTP_HEADER_USER_AGENT,
+                getAgentString(conf.get("http.agent.name", ""),
+                        conf.get("http.agent.version", ""),
+                        conf.get("http.agent.description", ""),
+                        conf.get("http.agent.url", ""),
+                        conf.get("http.agent.email", "")));
+        record.addLabelValue(HTTP_HEADER_FROM,
+                conf.get("http.agent.email", ""));
+
+        try {
+            record.addLabelValue(HOSTNAME, getHostname(conf));
+            record.addLabelValue(IP, getIPAddress(conf));
+        } catch (UnknownHostException ignored) {
+            // do nothing as this fields are optional
+        }
+
+        record.addLabelValue(ROBOTS, "classic"); // TODO Make configurable?
+        record.addLabelValue(OPERATOR, conf.get("http.agent.email", ""));
+
+        return record;
+    }
+
+    public static final String getHostname(Configuration conf)
+            throws UnknownHostException {
+
+        return StringUtil.isEmpty(conf.get("http.agent.host", "")) ?
+                InetAddress.getLocalHost().getHostName() :
+                conf.get("http.agent.host");
+    }
+
+    public static final String getIPAddress(Configuration conf)
+            throws UnknownHostException {
+
+        return InetAddress.getLocalHost().getHostAddress();
+    }
+
+    public static final byte[] toByteArray(HttpHeaders headers)
+            throws IOException {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        headers.write(out);
+
+        return out.toByteArray();
+    }
+
+    public static final String getAgentString(String name, String version,
+            String description, String URL, String email) {
+
+        StringBuffer buf = new StringBuffer();
+
+        buf.append(name);
+
+        if (version != null) {
+            buf.append("/").append(version);
+        }
+
+        if (((description != null) && (description.length() != 0)) || (
+                (email != null) && (email.length() != 0)) || ((URL != null) && (
+                URL.length() != 0))) {
+            buf.append(" (");
+
+            if ((description != null) && (description.length() != 0)) {
+                buf.append(description);
+                if ((URL != null) || (email != null))
+                    buf.append("; ");
+            }
+
+            if ((URL != null) && (URL.length() != 0)) {
+                buf.append(URL);
+                if (email != null)
+                    buf.append("; ");
+            }
+
+            if ((email != null) && (email.length() != 0))
+                buf.append(email);
+
+            buf.append(")");
+        }
+
+        return buf.toString();
+    }
+
+    public static final WARCRecordInfo docToMetadata(NutchDocument doc)
+            throws UnsupportedEncodingException {
+        WARCRecordInfo record = new WARCRecordInfo();
+
+        record.setType(WARCConstants.WARCRecordType.metadata);
+        record.setUrl((String) doc.getFieldValue("id"));
+        record.setCreate14DigitDate(
+                DateUtils.get14DigitDate((Date) doc.getFieldValue("tstamp")));
+        record.setMimetype("application/warc-fields");
+        record.setRecordId(generator.getRecordID());
+
+        // metadata
+        ANVLRecord metadata = new ANVLRecord();
+
+        for (String field : doc.getFieldNames()) {
+            List<Object> values = doc.getField(field).getValues();
+            for (Object value : values) {
+                if (value instanceof Date) {
+                    metadata.addLabelValue(field, DateUtils.get14DigitDate());
+                } else {
+                    metadata.addLabelValue(field, (String) value);
+                }
+            }
+        }
+
+        record.setContentLength(metadata.getLength());
+        record.setContentStream(
+                new ByteArrayInputStream(metadata.getUTF8Bytes()));
+
+        return record;
+    }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcInputFormat.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcInputFormat.java b/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcInputFormat.java
new file mode 100644
index 0000000..0eb7bf6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcInputFormat.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tools.arc;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * A input format the reads arc files.
+ */
+public class ArcInputFormat extends FileInputFormat<Text, BytesWritable> {
+
+  /**
+   * Returns the <code>RecordReader</code> for reading the arc file.
+   * 
+   * @param split
+   *          The InputSplit of the arc file to process.
+   * @param job
+   *          The job configuration.
+   * @param reporter
+   *          The progress reporter.
+   */
+  public RecordReader<Text, BytesWritable> getRecordReader(InputSplit split,
+      JobConf job, Reporter reporter) throws IOException {
+    reporter.setStatus(split.toString());
+    return new ArcRecordReader(job, (FileSplit) split);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcRecordReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcRecordReader.java
new file mode 100644
index 0000000..e9ff58d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcRecordReader.java
@@ -0,0 +1,299 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tools.arc;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.zip.GZIPInputStream;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * <p>
+ * The <code>ArchRecordReader</code> class provides a record reader which reads
+ * records from arc files.
+ * </p>
+ * 
+ * <p>
+ * Arc files are essentially tars of gzips. Each record in an arc file is a
+ * compressed gzip. Multiple records are concatenated together to form a
+ * complete arc. For more information on the arc file format see {@link http
+ * ://www.archive.org/web/researcher/ArcFileFormat.php } .
+ * </p>
+ * 
+ * <p>
+ * Arc files are used by the internet archive and grub projects.
+ * </p>
+ * 
+ * see {@link http://www.archive.org/ } see {@link http://www.grub.org/ }
+ */
+public class ArcRecordReader implements RecordReader<Text, BytesWritable> {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ArcRecordReader.class);
+
+  protected Configuration conf;
+  protected long splitStart = 0;
+  protected long pos = 0;
+  protected long splitEnd = 0;
+  protected long splitLen = 0;
+  protected long fileLen = 0;
+  protected FSDataInputStream in;
+
+  private static byte[] MAGIC = { (byte) 0x1F, (byte) 0x8B };
+
+  /**
+   * <p>
+   * Returns true if the byte array passed matches the gzip header magic number.
+   * </p>
+   * 
+   * @param input
+   *          The byte array to check.
+   * 
+   * @return True if the byte array matches the gzip header magic number.
+   */
+  public static boolean isMagic(byte[] input) {
+
+    // check for null and incorrect length
+    if (input == null || input.length != MAGIC.length) {
+      return false;
+    }
+
+    // check byte by byte
+    for (int i = 0; i < MAGIC.length; i++) {
+      if (MAGIC[i] != input[i]) {
+        return false;
+      }
+    }
+
+    // must match
+    return true;
+  }
+
+  /**
+   * Constructor that sets the configuration and file split.
+   * 
+   * @param conf
+   *          The job configuration.
+   * @param split
+   *          The file split to read from.
+   * 
+   * @throws IOException
+   *           If an IO error occurs while initializing file split.
+   */
+  public ArcRecordReader(Configuration conf, FileSplit split)
+      throws IOException {
+
+    Path path = split.getPath();
+    FileSystem fs = path.getFileSystem(conf);
+    fileLen = fs.getFileStatus(split.getPath()).getLen();
+    this.conf = conf;
+    this.in = fs.open(split.getPath());
+    this.splitStart = split.getStart();
+    this.splitEnd = splitStart + split.getLength();
+    this.splitLen = split.getLength();
+    in.seek(splitStart);
+  }
+
+  /**
+   * Closes the record reader resources.
+   */
+  public void close() throws IOException {
+    this.in.close();
+  }
+
+  /**
+   * Creates a new instance of the <code>Text</code> object for the key.
+   */
+  public Text createKey() {
+    return ReflectionUtils.newInstance(Text.class, conf);
+  }
+
+  /**
+   * Creates a new instance of the <code>BytesWritable</code> object for the key
+   */
+  public BytesWritable createValue() {
+    return ReflectionUtils.newInstance(BytesWritable.class, conf);
+  }
+
+  /**
+   * Returns the current position in the file.
+   * 
+   * @return The long of the current position in the file.
+   */
+  public long getPos() throws IOException {
+    return in.getPos();
+  }
+
+  /**
+   * Returns the percentage of progress in processing the file. This will be
+   * represented as a float from 0 to 1 with 1 being 100% completed.
+   * 
+   * @return The percentage of progress as a float from 0 to 1.
+   */
+  public float getProgress() throws IOException {
+
+    // if we haven't even started
+    if (splitEnd == splitStart) {
+      return 0.0f;
+    } else {
+      // the progress is current pos - where we started / length of the split
+      return Math.min(1.0f, (getPos() - splitStart) / (float) splitLen);
+    }
+  }
+
+  /**
+   * <p>
+   * Returns true if the next record in the split is read into the key and value
+   * pair. The key will be the arc record header and the values will be the raw
+   * content bytes of the arc record.
+   * </p>
+   * 
+   * @param key
+   *          The record key
+   * @param value
+   *          The record value
+   * 
+   * @return True if the next record is read.
+   * 
+   * @throws IOException
+   *           If an error occurs while reading the record value.
+   */
+  public boolean next(Text key, BytesWritable value) throws IOException {
+
+    try {
+
+      // get the starting position on the input stream
+      long startRead = in.getPos();
+      byte[] magicBuffer = null;
+
+      // we need this loop to handle false positives in reading of gzip records
+      while (true) {
+
+        // while we haven't passed the end of the split
+        if (startRead >= splitEnd) {
+          return false;
+        }
+
+        // scanning for the gzip header
+        boolean foundStart = false;
+        while (!foundStart) {
+
+          // start at the current file position and scan for 1K at time, break
+          // if there is no more to read
+          startRead = in.getPos();
+          magicBuffer = new byte[1024];
+          int read = in.read(magicBuffer);
+          if (read < 0) {
+            break;
+          }
+
+          // scan the byte array for the gzip header magic number. This happens
+          // byte by byte
+          for (int i = 0; i < read - 1; i++) {
+            byte[] testMagic = new byte[2];
+            System.arraycopy(magicBuffer, i, testMagic, 0, 2);
+            if (isMagic(testMagic)) {
+              // set the next start to the current gzip header
+              startRead += i;
+              foundStart = true;
+              break;
+            }
+          }
+        }
+
+        // seek to the start of the gzip header
+        in.seek(startRead);
+        ByteArrayOutputStream baos = null;
+        int totalRead = 0;
+
+        try {
+
+          // read 4K of the gzip at a time putting into a byte array
+          byte[] buffer = new byte[4096];
+          GZIPInputStream zin = new GZIPInputStream(in);
+          int gzipRead = -1;
+          baos = new ByteArrayOutputStream();
+          while ((gzipRead = zin.read(buffer, 0, buffer.length)) != -1) {
+            baos.write(buffer, 0, gzipRead);
+            totalRead += gzipRead;
+          }
+        } catch (Exception e) {
+
+          // there are times we get false positives where the gzip header exists
+          // but it is not an actual gzip record, so we ignore it and start
+          // over seeking
+          System.out.println("Ignoring position: " + (startRead));
+          if (startRead + 1 < fileLen) {
+            in.seek(startRead + 1);
+          }
+          continue;
+        }
+
+        // change the output stream to a byte array
+        byte[] content = baos.toByteArray();
+
+        // the first line of the raw content in arc files is the header
+        int eol = 0;
+        for (int i = 0; i < content.length; i++) {
+          if (i > 0 && content[i] == '\n') {
+            eol = i;
+            break;
+          }
+        }
+
+        // create the header and the raw content minus the header
+        String header = new String(content, 0, eol).trim();
+        byte[] raw = new byte[(content.length - eol) - 1];
+        System.arraycopy(content, eol + 1, raw, 0, raw.length);
+
+        // populate key and values with the header and raw content.
+        Text keyText = key;
+        keyText.set(header);
+        BytesWritable valueBytes = value;
+        valueBytes.set(raw, 0, raw.length);
+
+        // TODO: It would be best to start at the end of the gzip read but
+        // the bytes read in gzip don't match raw bytes in the file so we
+        // overshoot the next header. With this current method you get
+        // some false positives but don't miss records.
+        if (startRead + 1 < fileLen) {
+          in.seek(startRead + 1);
+        }
+
+        // populated the record, now return
+        return true;
+      }
+    } catch (Exception e) {
+      LOG.equals(StringUtils.stringifyException(e));
+    }
+
+    // couldn't populate the record or there is no next record to read
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
new file mode 100644
index 0000000..39b8d95
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
@@ -0,0 +1,426 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tools.arc;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Map.Entry;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.fetcher.FetcherOutputFormat;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TimingUtil;
+
+/**
+ * <p>
+ * The <code>ArcSegmentCreator</code> is a replacement for fetcher that will
+ * take arc files as input and produce a nutch segment as output.
+ * </p>
+ * 
+ * <p>
+ * Arc files are tars of compressed gzips which are produced by both the
+ * internet archive project and the grub distributed crawler project.
+ * </p>
+ * 
+ */
+public class ArcSegmentCreator extends Configured implements Tool,
+    Mapper<Text, BytesWritable, Text, NutchWritable> {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ArcSegmentCreator.class);
+  public static final String URL_VERSION = "arc.url.version";
+  private JobConf jobConf;
+  private URLFilters urlFilters;
+  private ScoringFilters scfilters;
+  private ParseUtil parseUtil;
+  private URLNormalizers normalizers;
+  private int interval;
+
+  private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
+
+  public ArcSegmentCreator() {
+
+  }
+
+  /**
+   * <p>
+   * Constructor that sets the job configuration.
+   * </p>
+   * 
+   * @param conf
+   */
+  public ArcSegmentCreator(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Generates a random name for the segments.
+   * 
+   * @return The generated segment name.
+   */
+  public static synchronized String generateSegmentName() {
+    try {
+      Thread.sleep(1000);
+    } catch (Throwable t) {
+    }
+    return sdf.format(new Date(System.currentTimeMillis()));
+  }
+
+  /**
+   * <p>
+   * Configures the job. Sets the url filters, scoring filters, url normalizers
+   * and other relevant data.
+   * </p>
+   * 
+   * @param job
+   *          The job configuration.
+   */
+  public void configure(JobConf job) {
+
+    // set the url filters, scoring filters the parse util and the url
+    // normalizers
+    this.jobConf = job;
+    this.urlFilters = new URLFilters(jobConf);
+    this.scfilters = new ScoringFilters(jobConf);
+    this.parseUtil = new ParseUtil(jobConf);
+    this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_FETCHER);
+    interval = jobConf.getInt("db.fetch.interval.default", 2592000);
+  }
+
+  public void close() {
+  }
+
+  /**
+   * <p>
+   * Parses the raw content of a single record to create output. This method is
+   * almost the same as the {@link org.apache.nutch.Fetcher#output} method in
+   * terms of processing and output.
+   * 
+   * @param output
+   *          The job output collector.
+   * @param segmentName
+   *          The name of the segment to create.
+   * @param key
+   *          The url of the record.
+   * @param datum
+   *          The CrawlDatum of the record.
+   * @param content
+   *          The raw content of the record
+   * @param pstatus
+   *          The protocol status
+   * @param status
+   *          The fetch status.
+   * 
+   * @return The result of the parse in a ParseStatus object.
+   */
+  private ParseStatus output(OutputCollector<Text, NutchWritable> output,
+      String segmentName, Text key, CrawlDatum datum, Content content,
+      ProtocolStatus pstatus, int status) {
+
+    // set the fetch status and the fetch time
+    datum.setStatus(status);
+    datum.setFetchTime(System.currentTimeMillis());
+    if (pstatus != null)
+      datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
+
+    ParseResult parseResult = null;
+    if (content != null) {
+      Metadata metadata = content.getMetadata();
+      // add segment to metadata
+      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
+      // add score to content metadata so that ParseSegment can pick it up.
+      try {
+        scfilters.passScoreBeforeParsing(key, datum, content);
+      } catch (Exception e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+        }
+      }
+
+      try {
+
+        // parse the content
+        parseResult = this.parseUtil.parse(content);
+      } catch (Exception e) {
+        LOG.warn("Error parsing: " + key + ": "
+            + StringUtils.stringifyException(e));
+      }
+
+      // set the content signature
+      if (parseResult == null) {
+        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+            content, new ParseStatus().getEmptyParse(getConf()));
+        datum.setSignature(signature);
+      }
+
+      try {
+        output.collect(key, new NutchWritable(datum));
+        output.collect(key, new NutchWritable(content));
+
+        if (parseResult != null) {
+          for (Entry<Text, Parse> entry : parseResult) {
+            Text url = entry.getKey();
+            Parse parse = entry.getValue();
+            ParseStatus parseStatus = parse.getData().getStatus();
+
+            if (!parseStatus.isSuccess()) {
+              LOG.warn("Error parsing: " + key + ": " + parseStatus);
+              parse = parseStatus.getEmptyParse(getConf());
+            }
+
+            // Calculate page signature.
+            byte[] signature = SignatureFactory.getSignature(getConf())
+                .calculate(content, parse);
+            // Ensure segment name and score are in parseData metadata
+            parse.getData().getContentMeta()
+                .set(Nutch.SEGMENT_NAME_KEY, segmentName);
+            parse.getData().getContentMeta()
+                .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
+            // Pass fetch time to content meta
+            parse.getData().getContentMeta()
+                .set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
+            if (url.equals(key))
+              datum.setSignature(signature);
+            try {
+              scfilters.passScoreAfterParsing(url, content, parse);
+            } catch (Exception e) {
+              if (LOG.isWarnEnabled()) {
+                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+              }
+            }
+            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
+                parse.getText()), parse.getData(), parse.isCanonical())));
+          }
+        }
+      } catch (IOException e) {
+        if (LOG.isErrorEnabled()) {
+          LOG.error("ArcSegmentCreator caught:"
+              + StringUtils.stringifyException(e));
+        }
+      }
+
+      // return parse status if it exits
+      if (parseResult != null && !parseResult.isEmpty()) {
+        Parse p = parseResult.get(content.getUrl());
+        if (p != null) {
+          return p.getData().getStatus();
+        }
+      }
+    }
+
+    return null;
+  }
+
+  /**
+   * <p>
+   * Logs any error that occurs during conversion.
+   * </p>
+   * 
+   * @param url
+   *          The url we are parsing.
+   * @param t
+   *          The error that occured.
+   */
+  private void logError(Text url, Throwable t) {
+    if (LOG.isInfoEnabled()) {
+      LOG.info("Conversion of " + url + " failed with: "
+          + StringUtils.stringifyException(t));
+    }
+  }
+
+  /**
+   * <p>
+   * Runs the Map job to translate an arc record into output for Nutch segments.
+   * </p>
+   * 
+   * @param key
+   *          The arc record header.
+   * @param bytes
+   *          The arc record raw content bytes.
+   * @param output
+   *          The output collecter.
+   * @param reporter
+   *          The progress reporter.
+   */
+  public void map(Text key, BytesWritable bytes,
+      OutputCollector<Text, NutchWritable> output, Reporter reporter)
+      throws IOException {
+
+    String[] headers = key.toString().split("\\s+");
+    String urlStr = headers[0];
+    String version = headers[2];
+    String contentType = headers[3];
+
+    // arcs start with a file description. for now we ignore this as it is not
+    // a content record
+    if (urlStr.startsWith("filedesc://")) {
+      LOG.info("Ignoring file header: " + urlStr);
+      return;
+    }
+    LOG.info("Processing: " + urlStr);
+
+    // get the raw bytes from the arc file, create a new crawldatum
+    Text url = new Text();
+    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
+        1.0f);
+    String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);
+
+    // normalize and filter the urls
+    try {
+      urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
+      urlStr = urlFilters.filter(urlStr); // filter the url
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Skipping " + url + ":" + e);
+      }
+      urlStr = null;
+    }
+
+    // if still a good url then process
+    if (urlStr != null) {
+
+      url.set(urlStr);
+      try {
+
+        // set the protocol status to success and the crawl status to success
+        // create the content from the normalized url and the raw bytes from
+        // the arc file, TODO: currently this doesn't handle text of errors
+        // pages (i.e. 404, etc.). We assume we won't get those.
+        ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
+        Content content = new Content(urlStr, urlStr, bytes.getBytes(),
+            contentType, new Metadata(), getConf());
+
+        // set the url version into the metadata
+        content.getMetadata().set(URL_VERSION, version);
+        ParseStatus pstatus = null;
+        pstatus = output(output, segmentName, url, datum, content, status,
+            CrawlDatum.STATUS_FETCH_SUCCESS);
+        reporter.progress();
+      } catch (Throwable t) { // unexpected exception
+        logError(url, t);
+        output(output, segmentName, url, datum, null, null,
+            CrawlDatum.STATUS_FETCH_RETRY);
+      }
+    }
+  }
+
+  /**
+   * <p>
+   * Creates the arc files to segments job.
+   * </p>
+   * 
+   * @param arcFiles
+   *          The path to the directory holding the arc files
+   * @param segmentsOutDir
+   *          The output directory for writing the segments
+   * 
+   * @throws IOException
+   *           If an IO error occurs while running the job.
+   */
+  public void createSegments(Path arcFiles, Path segmentsOutDir)
+      throws IOException {
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    if (LOG.isInfoEnabled()) {
+      LOG.info("ArcSegmentCreator: starting at " + sdf.format(start));
+      LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
+    }
+
+    JobConf job = new NutchJob(getConf());
+    job.setJobName("ArcSegmentCreator " + arcFiles);
+    String segName = generateSegmentName();
+    job.set(Nutch.SEGMENT_NAME_KEY, segName);
+    FileInputFormat.addInputPath(job, arcFiles);
+    job.setInputFormat(ArcInputFormat.class);
+    job.setMapperClass(ArcSegmentCreator.class);
+    FileOutputFormat.setOutputPath(job, new Path(segmentsOutDir, segName));
+    job.setOutputFormat(FetcherOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(NutchWritable.class);
+
+    JobClient.runJob(job);
+
+    long end = System.currentTimeMillis();
+    LOG.info("ArcSegmentCreator: finished at " + sdf.format(end)
+        + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+  }
+
+  public static void main(String args[]) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(),
+        new ArcSegmentCreator(), args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
+
+    String usage = "Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>";
+
+    if (args.length < 2) {
+      System.err.println(usage);
+      return -1;
+    }
+
+    // set the arc files directory and the segments output directory
+    Path arcFiles = new Path(args[0]);
+    Path segmentsOutDir = new Path(args[1]);
+
+    try {
+      // create the segments from the arc files
+      createSegments(arcFiles, segmentsOutDir);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("ArcSegmentCreator: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/arc/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/arc/package-info.java b/nutch-core/src/main/java/org/apache/nutch/tools/arc/package-info.java
new file mode 100644
index 0000000..cb6e115
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/arc/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tools to read the
+ * <a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>.
+ */
+package org.apache.nutch.tools.arc;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/package-info.java b/nutch-core/src/main/java/org/apache/nutch/tools/package-info.java
new file mode 100644
index 0000000..3b868c5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Miscellaneous tools.
+ */
+package org.apache.nutch.tools;
+


[18/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/fr.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/fr.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/fr.test
new file mode 100644
index 0000000..05e5e35
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/fr.test
@@ -0,0 +1,105 @@
+Reprise de la session
+Je d�clare reprise la session du Parlement europ�en qui avait �t� interrompue le vendredi 17 d�cembre dernier et je vous renouvelle tous mes vux en esp�rant que vous avez pass� de bonnes vacances.
+Comme vous avez pu le constater, le grand "bogue de l'an 2000" ne s'est pas produit. En revanche, les citoyens d'un certain nombre de nos pays ont �t� victimes de catastrophes naturelles qui ont vraiment �t� terribles. Vous avez souhait� un d�bat � ce sujet dans les prochains jours, au cours de cette p�riode de session. En attendant, je souhaiterais, comme un certain nombre de coll�gues me l'ont demand�, que nous observions une minute de silence pour toutes les victimes, des temp�tes notamment, dans les diff�rents pays de l'Union europ�enne qui ont �t� touch�s. Je vous invite � vous lever pour cette minute de silence.
+(Le Parlement, debout, observe une minute de silence)
+
+Madame la Pr�sidente, c'est une motion de proc�dure. Vous avez probablement appris par la presse et par la t�l�vision que plusieurs attentats � la bombe et crimes ont �t� perp�tr�s au Sri Lanka. L'une des personnes qui vient d'�tre assassin�e au Sri Lanka est M. Kumar Ponnambalam, qui avait rendu visite au Parlement europ�en il y a quelques mois � peine. Ne pensez-vous pas, Madame la Pr�sidente, qu'il conviendrait d'�crire une lettre au pr�sident du Sri Lanka pour lui communiquer que le Parlement d�plore les morts violentes, dont celle de M. Ponnambalam, et pour l'inviter instamment � faire tout ce qui est en son pouvoir pour chercher une r�conciliation pacifique et mettre un terme � cette situation particuli�rement difficile.
+
+Oui, Monsieur Evans, je pense qu'une initiative dans le sens que vous venez de sugg�rer serait tout � fait appropri�e. Si l'Assembl�e en est d'accord, je ferai comme M. Evans l'a sugg�r�.
+
+Madame la Pr�sidente, c'est une motion de proc�dure. Je voudrais vous demander un conseil au sujet de l'article 143, qui concerne l'irrecevabilit�. Ma question porte sur un sujet qui est � l'ordre du jour du jeudi et que je soul�verai donc une nouvelle fois.
+Le paragraphe 6 du rapport Cunha sur les programmes d'orientation pluriannuels, qui sera soumis au Parlement ce jeudi, propose d'introduire des sanctions applicables aux pays qui ne respectent pas les objectifs annuels de r�duction de leur flotte. Il pr�cise que cela devrait �tre fait malgr� le principe de stabilit� relative. � mon sens, le principe de stabilit� relative est un principe juridique fondamental de la politique commune de la p�che et toute proposition le bouleversant serait juridiquement irrecevable. Je voudrais savoir si l'on peut avancer une objection de ce type � ce qui n'est qu'un rapport, pas une proposition l�gislative, et si je suis habilit� � le faire ce jeudi.
+
+C'est exactement � ce moment-l� que vous pourrez, en effet, si vous le souhaitez, soulever cette question, c'est-�-dire jeudi avant le d�but de la pr�sentation du rapport.
+
+Madame la Pr�sidente, alors que se d�roule la premi�re session de l'ann�e du Parlement europ�en, l'ex�cution d'un condamn� � mort au Texas aux �tats-Unis, un jeune homme de 34 ans appel� Hicks, a �t� fix�e, malheureusement, � jeudi prochain.
+� la demande d'un d�put� fran�ais, Monsieur Zimeray, une p�tition a d�j� �t� introduite ; elle a r�colt� de nombreuses signatures dont la mienne. Cependant, je vous demande, conform�ment � l'orientation d�sormais constamment exprim�e par le Parlement europ�en et toute la Communaut� europ�enne, d'intervenir aupr�s du pr�sident et du gouverneur du Texas, Monsieur Bush, en faisant jouer le prestige de votre mandat et de l'Institution que vous repr�sentez, car c'est Monsieur Bush qui a le pouvoir de suspendre la condamnation � mort et de gracier le condamn�.
+Et tout ceci dans le respect des principes que nous avons toujours soutenus.
+
+Merci, Monsieur Segni, je le ferai bien volontiers. C'est en effet tout � fait dans la ligne des positions que notre Parlement a toujours adopt�es.
+
+Madame la Pr�sidente, je voudrais attirer votre attention sur un cas dont s'est r�guli�rement occup� le Parlement. Il s'agit du cas d'Alexandre Nikitin. Nous nous r�jouissons tous, dans cette enceinte, que le tribunal ait prononc� sa lib�ration et ait clairement �tabli qu'en Russie aussi, l'acc�s aux informations environnementales �tait un droit constitutionnel. Cependant, il se fait qu'il semblerait �tre � nouveau mis en accusation, le minist�re public ayant interjet� appel. Nous savons, et nous l'avons d'ailleurs �tabli dans de tr�s nombreuses r�solutions - y compris lors de la derni�re p�riode de session de l'ann�e derni�re -, que ce cas n'est pas seulement de nature juridique et qu'il est faux d'accuser Alexandre Nikitin d'activit� criminelle et de trahison car nous sommes concern�s par ses r�sultats et nous en profitons. Ces r�sultats forment la base des programmes europ�ens de protection de la mer de Barents et c'est pourquoi je vous prie d'examiner u
 n projet de lettre vous d�peignant les faits essentiels de cette affaire et de communiquer � la Russie la position qui d�coule des d�cisions du Parlement.
+
+Oui, Madame Schroedter, j'examinerai bien volontiers les faits relatifs � cette question lorsque j'aurai re�u votre lettre.
+
+Madame la Pr�sidente, je voudrais tout d'abord vous f�liciter pour avoir tenu parole car en effet, en cette premi�re p�riode de session, en cette nouvelle ann�e, le nombre de cha�nes de t�l�vision a r�ellement �t� augment� de mani�re significative dans nos locaux. Toutefois, Madame la Pr�sidente, ce que j'avais demand� n'a pas �t� r�alis�. Il y a bien deux cha�nes finnoises et une cha�ne portugaise, mais il n'y a toujours aucune cha�ne n�erlandaise. Pourtant je vous avais demand� une cha�ne n�erlandaise, car les N�erlandais aussi d�sirent pouvoir suivre les actualit�s chaque mois lorsqu'ils sont envoy�s en cette terre d'exil. Je vous demande donc � nouveau de faire le n�cessaire pour que nous puissions disposer d'une cha�ne n�erlandaise.
+
+Madame Plooij-van Gorsel, je peux vous dire que cette question est � l'ordre du jour de la r�union des questeurs de mercredi. Elle sera, je l'esp�re, examin�e dans un esprit positif.
+
+Madame la Pr�sidente, comment se fait-il que le Parlement ne se conforme pas � la r�glementation en mati�re de sant� et de s�curit� qu'il vote ? Comment se fait-il qu'aucun test de qualit� de l'air n'ait �t� r�alis� dans ce b�timent depuis notre �lection ? Comment se fait-il que le comit� de sant� et d'hygi�ne ne se soit plus r�uni depuis 1998 ? Comment se fait-il que nous n'ayons jamais fait d'exercice d'�vacuation dans les b�timents du Parlement de Bruxelles et de Strasbourg ? Comment se fait-il qu'il n'y ait pas de consignes en cas d'incendie ? Comment se fait-il que les escaliers n'aient pas �t� am�lior�s depuis mon accident ? Comment se fait-il que l'on ne respecte pas les zones non fumeurs ? Nous votons des r�glementations et nous ne nous y conformons m�me pas. C'est scandaleux.
+
+Madame Lynne, vous avez parfaitement raison et je vais v�rifier si tout cela n' a effectivement pas �t� fait. Je vais soumettre �galement le probl�me au Coll�ge des questeurs et je suis certaine que nos questeurs auront � cur de faire en sorte que nous respections la r�glementation qu' en effet nous votons.
+
+Madame la Pr�sidente, Mme D�ez Gonz�lez et moi-m�me avions pr�sent� quelques questions sur certaines opinions exprim�es par la vice-pr�sidente, Mme de Palacio, et publi�es dans un journal espagnol. Les services comp�tents ne les ont pas inclues � l'ordre du jour, avan�ant que des r�ponses avaient d�j� �t� apport�es lors d'une pr�c�dente session.
+Je demande que cette d�cision soit reconsid�r�e car ce n'est pas le cas. Les questions auxquelles on a r�pondu pr�c�demment se rapportaient � l'intervention de Mme de Palacio dans un dossier pr�cis, et non aux d�clarations parues dans le journal ABC du 18 novembre dernier.
+
+ Cher coll�gue nous allons v�rifier tout cela. Je vous avouerai que, pour le moment, les choses me semblent un petit peu confuses. Donc, nous allons revoir cela tr�s s�rieusement pour que tout soit bien en ordre.
+
+Madame la Pr�sidente, je voudrais savoir si cette semaine, le Parlement va envoyer un message clair exprimant son m�contentement quant � la d�cision prise aujourd'hui de refuser de renouveler l'embargo sur les armes � destination de l'Indon�sie, alors que par le pass�, le Parlement a souscrit � une tr�s large majorit� � l'embargo sur les armes � destination de l'Indon�sie. La d�cision d'aujourd'hui de ne pas renouveler l'embargo est extr�mement dangereuse, compte tenu de la situation sur place. Le Parlement devrait d�s lors envoyer un message en ce sens, �tant donn� qu'une grande majorit� des d�put�s le souhaite. En refusant de prolonger l'embargo, les �tats membres font preuve d'irresponsabilit�. D'aucuns l'ont dit, la situation en Indon�sie est extr�mement explosive. Il y a en r�alit� un risque de coup d'�tat. Nous ne savons pas ce qui se passe. Pourquoi donc les producteurs d'armes de l'UE devraient-ils s'enrichir sur le dos de personnes innocentes ?
+
+En tout cas, cette question ne figure pas pour l' instant parmi les demandes d' urgence pour jeudi prochain.
+
+Ordre des travaux
+L' ordre du jour appelle l' examen du projet d�finitif d' ordre du jour tel qu' il a �t� �tabli par la Conf�rence des pr�sidents, le jeudi 13 janvier, conform�ment � l' article 110 du r�glement. S' agissant de lundi et mardi, je n' ai pas de modifications.
+En ce qui concerne le mercredi :
+Le groupe socialiste demande d' inclure une d�claration de la Commission sur ses objectifs strat�giques pour les cinq ans � venir, ainsi que sur la r�forme administrative de la Commission.
+Je souhaiterais que M. Bar�n Crespo, auteur de la demande, intervienne pour la justifier, s' il le souhaite bien s�r. Puis , nous ferons comme d' habitude : nous entendrons un orateur pour et un orateur contre.
+
+Madame la Pr�sidente, la pr�sentation du programme politique de la Commission Prodi pour l'ensemble de la l�gislature �tait au d�part une proposition du groupe du parti des socialistes europ�ens qui a obtenu l'unanimit� lors de la Conf�rence des pr�sidents en septembre, mais aussi l'approbation explicite du pr�sident Prodi, qui a r�it�r� son engagement dans son discours d'investiture.
+Cet engagement est important dans la mesure o� la Commission est un organisme qui d�tient, conform�ment aux Trait�s, le monopole de l'initiative et qui, par cons�quent, d�cide pour l'essentiel quelle sera l'activit� politique et l�gislative de ce Parlement au cours des cinq prochaines ann�es. Je vous rappelle en outre, Madame la Pr�sidente, que ce Parlement a vot� par deux fois la confiance au pr�sident Prodi lors de la pr�c�dente l�gislature ; au cours de cette l�gislature, il l'a de nouveau vot�e en juillet et, ensuite, avec l'entr�e en fonction de la nouvelle Commission, il a redonn� sa confiance en septembre � l'ensemble de la Commission. Il s'est donc �coul� suffisamment de temps pour que la Commission pr�pare son programme et pour que nous puissions en prendre connaissance et l'expliquer aux citoyens. En ce sens, je vous rappelle la r�solution du 15 septembre, laquelle recommandait que la proposition soit pr�sent�e dans les plus brefs d�lais.
+Les �v�nements qui se sont produits la semaine derni�re � qui sont n�s en marge de la Conf�rence des pr�sidents et se servent d'elle dans le seul but de corroborer et de ratifier des d�cisions prises en dehors de celle-ci � cr�ent un dilemme : ou bien la Commission n'est pas en mesure de pr�senter ce programme ; (dans ce cas, il conviendrait qu'elle tire les choses au clair. Aux dires de son Pr�sident, la Commission serait en mesure de le faire. �tant donn� que la Commission est repr�sent�e par la vice-pr�sidente, Mme de Palacio, je crois qu'avant de voter, il serait bon de conna�tre la position de la Commission en ce qui concerne ses disponibilit�s pour pr�senter le programme, tel que cela avait �t� convenu) ou bien le Parlement n'est pas en mesure d'examiner ce programme, comme certains semblent le pr�tendre. � mon avis, cette deuxi�me hypoth�se signifierait le rejet de nos responsabilit�s en tant que Parlement, outre l'introduction d'une th�se original
 e, d'une m�thode inconnue qui consiste � communiquer aux groupes politiques le discours du programme de la Commission par �crit une semaine avant � et non le jour avant, comme il avait �t� convenu �, en tenant compte du fait que le programme l�gislatif serait discut� en f�vrier, de telle sorte que nous pourrions nous passer du d�bat, car le lendemain, la presse et Internet l'auraient port� � la connaissance de tous les citoyens et le Parlement n'aurait plus de raison de s'en occuper.
+Mon groupe estimant qu'un Parlement est l� pour �couter, pour d�battre et pour r�fl�chir, nous pensons qu'aucune raison ne justifie cet ajournement et nous croyons que si la Commission est en mesure de le pr�senter, nous avons parfaitement le temps pour r�tablir l'accord original entre le Parlement et la Commission et d'agir de mani�re responsable vis-�-vis de nos concitoyennes et concitoyens. Par cons�quent, la proposition du groupe du parti des socialistes europ�ens que vous avez mentionn�e est de maintenir � mercredi la pr�sentation du programme de l�gislature de la Commission Prodi, en incluant �galement dans le programme le projet de r�forme administrative car, si ce n'est pas fait, nous risquons de nous retrouver dans une position paradoxale : sous pr�texte qu'il n'y a pas de texte, on refuse d'une part le droit du pr�sident de la Commission � s'exprimer dans ce Parlement et, d'autre part, la tenue d'un d�bat sur une r�forme dont le Parlement ne conna�t 
 pas les textes. Je vous prie d�s lors, Madame la Pr�sidente, de demander � la Commission de s'exprimer maintenant et que l'on proc�de ensuite au vote.
+(Applaudissements du groupe PSE)
+
+Madame la Pr�sidente, chers coll�gues, je suis tout de m�me quelque peu surpris de l'attitude de notre coll�gue Bar�n Crespo qui demande � pr�sent que ce point de l'ordre du jour soit plac� � l'ordre du jour de la s�ance de mercredi.
+Monsieur Bar�n Crespo, vous n'avez pu �tre pr�sent � la Conf�rence des pr�sidents de jeudi dernier. Je ne le critique pas : il peut arriver qu'on se fasse repr�senter. M. H�nsch vous y a repr�sent�. Nous avons men� un d�bat en profondeur au cours de cette Conf�rence des pr�sidents. Seul votre groupe a d�fendu la position que vous venez d'exposer. Nous avons ensuite proc�d� � un vote, chaque pr�sident ou pr�sidente disposant d'un nombre de voix �gal au nombre de ses membres. Il y a donc eu un vote � ce stade de la r�union et, de ce que je me souviens, ce vote a donn� le r�sultat suivant : 422 voix contre 180 et quelques rares abstentions. Ce qui signifie qu'� l'exception des non-inscrits - qui ne constituent cependant pas un groupe -, seul votre groupe �tait d'avis de proc�der de la mani�re que vous venez de proposer. Tous les autres �taient d'un avis diff�rent. Telle �tait la d�cision prise.
+� pr�sent, je voudrais dire quelques mots sur l'affaire en tant que telle. Nous avons confiance en la Commission, en Romano Prodi, et, au terme d'un processus difficile connu de chacun, la toute grande majorit� de notre groupe a t�moign� sa confiance � Romano Prodi et � sa Commission. Mais nous pensons �galement devoir tenir un d�bat sur la strat�gie de la Commission dans le cadre d'une proc�dure ordonn�e, ne reposant pas seulement sur une d�claration orale faite au sein du Parlement europ�en mais sur un document adopt� par la Commission et d�crivant ce programme pour les cinq ans � venir. Un tel document n'existe pas.
+La Commission pr�sentera le programme pour l'an 2000 en f�vrier. Nous avons marqu� notre accord et dit que si la Commission ne pr�sentait pas ce programme en janvier, nous le ferions en f�vrier. Nous avons marqu� notre accord. Nous ne voulons pas d'un conflit avec la Commission et pensons que, dans la mesure du possible, la Commission et le Parlement doivent avancer de concert. Toutefois, le Parlement est �galement le contr�leur de la Commission. Et tout ce qui provient de la Commission ne doit pas avoir notre assentiment.
+Je voudrais que les groupes puissent proc�der � la pr�paration judicieuse d'un d�bat sur ce programme pour les cinq ans � venir. On ne peut s'y pr�parer si l'on entend une d�claration dans cette enceinte sans m�me conna�tre le contenu d'une telle d�claration. C'est pourquoi nous recommandons - et j'ai l'impression que la Commission se range �galement � cette id�e - de mener le d�bat sur le programme de la Commission jusqu'� 2005 au cours du mois de f�vrier - j'esp�re que d'ici l�, la Commission se sera accord�e sur un programme qu'elle nous soumettra - et de mener au cours du m�me mois de f�vrier le d�bat sur le programme l�gislatif de la Commission pour l'an 2000. La logique nous invite donc �galement � mener de concert les d�bats sur ces deux programmes. C'est pourquoi mon groupe rejette r�solument la proposition du groupe socialiste.
+(Applaudissements du groupe PPE-DE)
+
+Madame la Pr�sidente, avant toute chose, je voudrais qu'il soit bien clair que la Commission a le plus grand respect pour les d�cisions de ce Parlement et, notamment, pour celle qui concerne la fixation de l'ordre du jour. Par cons�quent, nous respectons les d�cisions que pourrait prendre le Parlement dans ce sens.
+Mais je voudrais �galement qu'il soit bien clair que le pr�sident Prodi s'est engag� avec le Parlement � instaurer un nouveau d�bat, comme l'a rappel� M. Bar�n, qui vient s'ajouter au d�bat annuel sur le programme l�gislatif de la Commission, sur les grandes lignes d'action pour la prochaine p�riode de cinq ans, c'est-�-dire pour cette l�gislature.
+Je voudrais dire, Madame la Pr�sidente, que, dans l'accord auquel on est parvenu au mois de septembre, ce d�bat diff�re de la pr�sentation annuelle du programme l�gislatif de la Commission. J'ajouterais, Madame la Pr�sidente, que, du c�t� de la Commission, nous sommes pr�ts et dispos�s � organiser ce d�bat quand cela vous conviendra, que nous �tions pr�ts � le d�velopper cette semaine, comme cela avait �t� d�cid� au d�part, en se basant sur le fait qu'il �tait pr�sent� la veille dans un discours aux groupes parlementaires.
+Je voudrais donc r�p�ter, Madame la Pr�sidente, que, pour notre part, nous avons discut� du programme d'action pour les cinq prochaines ann�es et que nous sommes pr�ts � venir pr�senter le programme pour les cinq prochaines ann�es quand le Parlement le d�cidera � y compris cette semaine, si telle est sa d�cision � et le programme pour l'an 2000, le mois prochain, ce sur quoi nous nous �tions parfaitement mis d'accord.
+
+Je propose que nous votions sur la demande du groupe socialiste visant � r�inscrire la d�claration de la Commission sur ses objectifs strat�giques.
+(Le Parlement rejette la demande) La Pr�sidente. Toujours au sujet de la journ�e du mercredi, j'ai une autre proposition concernant la question orale sur l'imp�t sur le capital. Le groupe PPE�DE demande de retirer ce point de l'ordre du jour.
+Y a-t-il un coll�gue pour prendre la parole au nom du groupe et justifier cette demande ?
+
+Madame la Pr�sidente, pour r�pondre aux rires que j'entends parmi les socialistes, on m'a dit que de larges pans du groupe socialiste aimeraient �galement supprimer ce point de l'ordre du jour car lors du scrutin au sein de la Conf�rence des pr�sidents, les coll�gues responsables du groupe socialiste ne disposaient pas du vote du groupe de travail. Je ne sais si cette information est correcte mais quoi qu'il en soit, le groupe PPE-DE vous saurait gr� de supprimer ce point de l'ordre du jour car le Parlement s'est en effet maintes fois saisi de cette question. Des d�cisions existent qui s'opposent � une telle taxe. C'est pourquoi mon groupe demande que ce point soit retir� de l'ordre du jour.
+
+Merci Monsieur Poettering.
+Nous entendons � pr�sent M. Wurtz, qui s' exprime contre cette demande.
+
+Madame la Pr�sidente, je voudrais d' abord souligner le manque de logique de M. Poettering. A l' instant, il vient de faire la le�on au groupe socialiste parce que celui-ci revient sur une d�cision qui a �t� prise de fa�on extr�mement nette en Conf�rence des pr�sidents. Or, il fait la m�me chose. Nous avons discut�, nous �tions unanimes sauf le groupe PPE et le groupe lib�ral et j' avais m�me fait remarquer, vous vous en souviendrez mes chers confr�res pr�sidents, que la question n' est pas de savoir si vous �tes pour ou contre la taxe Tobin, mais de savoir si vous osez entendre ce que la Commission et le Conseil en pensent. Ce n' est pas demander beaucoup. Donc, je r�it�re la proposition de maintenir cette question orale � la Commission et au Conseil pour conna�tre une fois pour toutes la position de ces deux instances par rapport � cette proposition relativement modeste, mais qui donnerait un signal important � l' opinion, en particulier apr�s l' �motion s
 uscit�e par l' �chec de la conf�rence de Seattle.
+
+Nous allons voter sur la demande du groupe PPE-DE visant � retirer la question orale concernant l' imp�t sur le capital de l' ordre du jour.
+(Le Parlement rejette la demande avec 164 voix pour, 166 voix contre et 7 abstentions)
+
+Madame la Pr�sidente, je voudrais remercier M. Poettering pour le coup de publicit� qu' il vient de donner � ce d�bat. Merci.
+
+Madame la Pr�sidente, a-t-on comptabilis� mon vote, qui n'a pu �tre r�alis� �lectroniquement parce que je n'ai pas ma carte ? J'ai vot� "pour".
+
+Effectivement, si on ajoute les deux coll�gues qui se sont manifest�s, nous obtenons comme r�sultat....
+
+Madame la Pr�sidente, la pr�sidence a proclam� le r�sultat du vote. Les modifications n'ont pas lieu d'�tre.
+
+Mes chers coll�gues, encore une fois, il faut que chacun ait bien sa carte le lundi. On voit que nous avons l� un probl�me. Cela �tant, je dois prendre une d�cision.
+J' ai aussi oubli� ma carte et j' aurais vot� contre. Je consid�re donc que la question orale reste maintenue � l' ordre du jour.
+C' est la derni�re fois que nous tiendrons compte des cartes oubli�es. Que ceci soit bien clair et qu' on se le dise.
+(Applaudissements)
+Oui, la question orale est maintenue � l' ordre du jour et oui, la pr�sidente a le droit de voter, comme elle a aussi le droit d' oublier sa carte.
+Nous poursuivons avec les autres modifications de l' ordre du jour.
+
+Madame la Pr�sidente, lors du dernier vote � et je m'en remets � votre d�cision sur ce sujet - sur la question du plan strat�gique de la Commission, j'ai signal� que je demandais la parole avant le vote au nom de mon groupe. Mais ma demande n'a pas �t� satisfaite. Je vous saurai gr�, � l'issue de ce point de l'ordre du jour, de me permettre de fournir un explication de vote au nom de mon groupe. C'est important. Il serait utile de consigner au proc�s-verbal du Parlement la mani�re dont les gens per�oivent ce que nous venons de faire, � la lumi�re de leur propre analyse politique.
+
+Madame la Pr�sidente, je ne veux pas relancer le d�bat mais j'avais �galement demand� la parole pour m'exprimer quant � la demande de M. Bar�n Crespo. Moi non plus, vous ne m'avez pas donn� la parole. Je le d�plore mais le vote a �t� effectu�, la d�cision est tomb�e et nous devrions donc en rester l�.
+
+Je suis d�sol�e, Monsieur H�nsch et Monsieur Cox, je n'avais pas vu que vous demandiez la parole. Cela �tant, je crois que les positions sont bien claires et elles seront consign�es au proc�s-verbal. Lorsque nous adopterons demain le proc�s-verbal de la s�ance d'aujourd'hui, les coll�gues qui estimeront que les positions n'ont pas �t� suffisamment bien expliqu�es pourront demander des modifications. Il me semble que c'est une bonne formule. Bien entendu, le proc�s-verbal de la r�union de demain tiendra compte de toutes les explications compl�mentaires. Je crois que c'est une meilleure formule que de proc�der maintenant � des explications de vote qui nous entra�neraient tr�s loin. Monsieur Cox, Monsieur H�nsch, est-ce que cela vous convient ?
+
+Madame la Pr�sidente, si le proc�s-verbal refl�te correctement le vote de mon groupe, je n'ai et n'aurai aucune objection � formuler. Si votre d�cision est que je ne puis pas donner d'explication de vote, je l'accepte, mais avec certaines r�serves.
+
+Nous ferons donc tr�s attention � la r�daction du proc�s-verbal. Nous le faisons d'ailleurs toujours. S'il ne refl�te pas bien les positions, nous pourrons �ventuellement le corriger.
+(Le Parlement adopte l'ordre des travaux ainsi modifi�)
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/it.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/it.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/it.test
new file mode 100644
index 0000000..15813fb
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/it.test
@@ -0,0 +1,109 @@
+Ripresa della sessione
+Dichiaro ripresa la sessione del Parlamento europeo, interrotta venerdì 17 dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze.
+Come avrete avuto modo di constatare il grande "baco del millennio" non si è materializzato. Invece, i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili. Avete chiesto che si tenesse una discussione su tale tema nei prossimi giorni, nel corso della presente tornata. Nel frattempo è mio desiderio, come del resto mi è stato chiesto da alcuni colleghi, osservare un minuto di silenzio in memoria di tutte le vittime delle tempeste che si sono abbattute sui diversi paesi dell' Unione europea. Vi invito pertanto ad alzarvi in piedi per osservare appunto un minuto di silenzio.
+(Il Parlamento osserva un minuto di silenzio)
+
+Signora Presidente, intervengo per una mozione d'ordine. Come avrà letto sui giornali o sentito alla televisione, in Sri Lanka si sono verificati numerosi assassinii ed esplosioni di ordigni. Una delle vittime più recenti è stato Kumar Ponnambalam, che qualche mese fa era venuto in visita qui al Parlamento europeo. Signora Presidente, sarebbe opportuno che inviasse una lettera alla Presidente del Sri Lanka per esprimere le condoglianze del Parlamento per questa e le altre morti violente verificatesi in Sri Lanka e per invitarla a fare quanto in suo potere al fine di giungere a una riconciliazione pacifica in questa situazione assai difficile.
+
+Sì, onorevole Evans, ritengo che un' iniziativa del tipo che lei propone sia assolutamente opportuna. Se l' Assemblea è d' accordo seguirò il suggerimento dell' onorevole Evans.
+
+Signora Presidente, un richiamo al Regolamento. Gradirei avere il suo parere riguardo all'articolo 143 sull'inammissibilità. La mia domanda si ricollega a un tema all'ordine del giorno di giovedì e che formulerò di nuovo al momento opportuno.
+La relazione Cunha sui programmi di orientamento pluriennali è iscritta all'ordine del giorno della Plenaria di giovedì e al paragrafo 6 contiene una proposta volta a introdurre una sorta di sanzione a carico delle quote di quei paesi che non riescono a raggiungere i loro obiettivi di riduzione annuali delle flotte, nonostante il principio della stabilità relativa. Credo che tale principio sia un principio giuridico fondamentale della politica comune della pesca e qualsiasi proposta volta a sovvertirlo sarebbe giuridicamente inammissibile Vorrei sapere se è possibile sollevare un'obiezione di questo tipo nel contesto di una semplice relazione, e non di una proposta legislativa, e se rientra nelle mie competenze sollevare una tale obiezione giovedì prossimo.
+
+E' appunto in quell' occasione che, se lo desidera, avrà modo di sollevare la sua questione pregiudiziale, cioè giovedì in apertura della discussione sulla relazione.
+
+Signora Presidente, in coincidenza con la prima tornata dell'anno del Parlamento europeo, negli Stati Uniti in Texas è stata fissata, purtroppo per giovedì prossimo, l'esecuzione di un condannato a morte, un giovane di 34 anni che chiameremo di nome Hicks.
+Su richiesta di un deputato francese, l'onorevole Zimeray, è già stata presentata una petizione, che ha avuto molti firmatari tra cui il sottoscritto, ma le chiedo, in conformità con l'indirizzo ormai costantemente espresso dal Parlamento europeo e da tutta la Comunità europea, di intervenire, con il prestigio della sua carica e dell'Istituzione che lei rappresenta, presso il Presidente e il Governatore del Texas Bush, che ha il potere di sospendere la condanna a morte e di graziare il condannato.
+E tutto ciò in conformità con i principi che abbiamo sempre sostenuto.
+
+La ringrazio, onorevole Segni, lo farò volentieri. In effetti ciò è assolutamente conforme alla posizione che il nostro Parlamento ha sempre sostenuto.
+
+Signora Presidente, vorrei richiamare l'attenzione su un caso che il Parlamento segue da tempo, ossia il caso di Alexander Nikitin. Noi tutti siamo lieti che il tribunale lo abbia assolto, ribadendo che anche in Russia l'accesso a informazioni sull'ambiente è un diritto sancito dalla costituzione. Ora, però, verrà messo nuovamente in stato di accusa perché il pubblico ministero ricorrerà in appello. Come sappiamo e come abbiamo fatto rilevare in innumerevoli risoluzioni - anche nell'ultima seduta plenaria dell'anno scorso - non si tratta semplicemente di un caso giudiziario ed è un grave errore accusare Alexander Nikitin di aver commesso reati e atti criminali, tanto più che noi, in quanto diretti interessati, abbiamo beneficiato dei risultati delle sue ricerche. Tali risultati sono alla base dei programmi europei di tutela del Mare di Barents. La prego pertanto di prendere in esame la bozza della lettera in cui vengono indicati i fatti principali e di sostenere presso le autorità russe la posizione assunta dal Parlamento, conformemente alle sue risoluzioni.
+
+Sì, onorevole Scroedter, esaminerò volentieri i fatti relativi alla questione da lei esposta non appena avrò ricevuto la sua lettera.
+
+Signora Presidente, mi permetta di farle innanzi tutto i miei complimenti per aver tenuto fede alla parola data. In effetti il numero di canali televisivi disponibili nei nostri uffici è aumentato enormemente in questa prima tornata dell'anno nuovo. Tuttavia, signora Presidente, non è ancora stato dato seguito alla mia richiesta. E' vero che adesso abbiamo due canali finlandesi e uno portoghese, ma purtroppo manca ancora il canale olandese. Ed era proprio quello che avevo chiesto, dato che noi parlamentari olandesi, quando veniamo spediti in questo esilio mensile, gradiremmo poter vedere il telegiornale in olandese. Ripeto ancora una volta la mia richiesta: faccia in modo che sia reso disponibile anche un canale olandese.
+
+Onorevole Plooj-van Gorsel, posso risponderle che tale punto figura all' ordine del giorno della riunione dei questori di mercoledì. Spero che sarà esaminata con uno spirito positivo.
+
+Signora Presidente, vorrei sapere perché questo Parlamento non rispetta le norme in materia di salute e sicurezza che esso stesso approva. Perché non è stato condotto alcun test della qualità dell'aria in questo edificio da quando siamo stati eletti? Perché dal 1998 il comitato salute e sicurezza non si è più riunito? Perché non sono state fatte prove dell'allarme antincendio né negli edifici del Parlamento di Bruxelles né qui a Strasburgo? Perché non esistono istruzioni da seguire in caso di incendio? Perché dopo il mio incidente non sono state apportate migliorie alle scale? Perché non viene fatto rispettare il divieto di fumare nelle aree riservate appunto ai non fumatori? E' assolutamente vergognoso che proprio noi non rispettiamo le norme da noi stessi approvate.
+
+Onorevole Lynne, lei ha perfettamente ragione e intendo verificare se tutto quanto lei ha detto davvero non è stato fatto. Intendo altresì sottoporre il punto al collegio dei questori e sono certa che ai nostri questori starà a cuore fare in modo che il Parlamento osservi le disposizioni che approva.
+
+Signora Presidente, l' onorevole Díez González e io avevamo presentato alcune interrogazioni in merito a determinate opinioni della Vicepresidente de Palacio riferite da un giornale spagnolo. I servizi competenti non le hanno inserite all' ordine del giorno, in quanto hanno ritenuto che avessero già ottenuto risposta in una tornata precedente.
+Chiedo che venga riesaminata tale decisione, in quanto non è così. Le interrogazioni cui è stata data risposta in precedenza riguardavano un intervento della Commissario de Palacio in un caso determinato, non le dichiarazioni pubblicate dal giornale ABC il 18 novembre scorso.
+
+Onorevole collega, sarà mia cura verificare tale punto. Devo confessarle che in questo momento la questione mi pare un po' confusa. Quindi verificheremo con estrema attenzione per essere certi che tutto sia corretto.
+
+Signora Presidente, gradirei sapere se questa settimana il Parlamento intende lanciare un segnale chiaro per esprimere il nostro scontento riguardo alla decisione presa oggi di rifiutare il rinnovo dell'embargo sulle armi contro l' Indonesia, visto e considerato che in passato la stragrande maggioranza dei deputati aveva sostenuto l'imposizione dell'embargo all'Indonesia. La decisione odierna di non rinnovarlo è pericolosissima, data la situazione sul posto. Il Parlamento dovrebbe pertanto inviare un messaggio, come auspica la stragrande maggioranza dei deputati. Gli Stati membri dell'Unione sono stati irresponsabili a non rinnovare l'embargo. Com'è già stato detto, la situazione in Indonesia è davvero esplosiva, con un forte rischio che in futuro si verifichi un colpo di Stato. Non sappiamo cosa stia succedendo e quindi mi chiedo perché si debba permettere ai produttori di armi dell'UE di trarne profitto a scapito di persone innocenti.
+
+Comunque sia, questo punto non è previsto nelle discussioni sui problemi di attualità di giovedì.
+
+Ordine dei lavori
+L' ordine del giorno reca la fissazione dell' ordine dei lavori.
+E' stata distribuita la versione definitiva del progetto di ordine del giorno, elaborata, ai sensi dell' articolo 110 del Regolamento, dalla Conferenza dei presidenti nella seduta di giovedì 13 gennaio. Non sono state proposte modifiche per lunedì e martedì.
+Mercoledì:
+Il gruppo PSE ha chiesto di iscrivere una dichiarazione della Commissione sui suoi obiettivi strategici per i prossimi cinque anni e sulla riforma amministrativa della Commissione.
+Desidero che l' onorevole Barón Crespo, autore della richiesta, intervenga per motivarla, ovviamente se lo desidera. Poi procederemo come di norma: sentiremo un oratore a favore e uno contro.
+
+Signora Presidente, l' idea che la Commissione Prodi presentasse il suo programma politico per tutta la legislatura proviene inizialmente da una proposta del gruppo del Partito del socialismo europeo, approvata all' unanimità dalla Conferenza dei Presidenti in settembre e anche accettata esplicitamente dal Presidente Prodi, che ha ribadito il suo impegno al riguardo durante il discorso di investitura.
+Si tratta di un impegno importante in quanto la Commissione è un organo che detiene il monopolio di iniziativa, conformemente ai Trattati, e di conseguenza delinea i tratti essenziali di quella che sarà l' attività politica e legislativa di questo Parlamento nei prossimi cinque anni. Ricordo altresì, signora Presidente, che durante la precedente legislatura il Parlamento ha votato due volte la fiducia a favore del Presidente Prodi; durante l' attuale legislatura l' ha votata di nuovo a luglio e poi, dopo l' insediamento della nuova Commissione, ha votato nuovamente la fiducia per l' intera Commissione in settembre. Quindi c' è già stato tempo a sufficienza per permettere alla Commissione di elaborare il suo programma e per consentirci di prenderne conoscenza per poi spiegarlo ai cittadini. A tale proposito ricordo la risoluzione del 15 settembre scorso, in cui si raccomandava di presentare la proposta il più rapidamente possibile.
+I fatti della settimana scorsa - scaturiti a latere della Conferenza dei Presidenti, sfruttata solo per corroborare e ratificare decisioni adottate al di fuori di essa - ci pongono di fronte a un dilemma: o la Commissione non è in grado di presentare questo programma (in tal caso, sarebbe opportuno che lo dicesse. A sentire il suo Presidente, è in grado di farlo. Dato che la Commissione è rappresentata dalla vicepresidente de Palacio, ritengo che prima di votare converrebbe sapere se la Commissione è sempre disposta a presentare il programma, conformemente agli accordi); oppure il Parlamento non è in grado di esaminare tale programma, come apparentemente sostengono alcuni. Secondo me, questa seconda ipotesi significherebbe rinunciare alle nostre responsabilità di Parlamento, oltre a introdurre una tesi originale, un metodo finora sconosciuto che consiste nel distribuire per iscritto ai gruppi politici il discorso programmatico della Commissione una settimana prima - e non il giorno prima, come era stato concordato. Considerando che il programma legislativo sarà discusso a febbraio, potremmo prescindere dal dibattito, in quanto il giorno dopo la stampa ed Internet avrebbero divulgato il testo a tutti i cittadini e quindi il Parlamento non avrebbe più bisogno di occuparsene.
+Secondo il mio gruppo, un Parlamento serve per ascoltare, discutere e riflettere, quindi a nostro avviso non c' è alcuna ragione che giustifichi questo rinvio. Se la Commissione è in grado di presentare il programma, secondo noi siamo perfettamente in tempo per ripristinare l' accordo iniziale intervenuto tra il Parlamento e la Commissione e comportarci responsabilmente dinnanzi ai nostri concittadini. Perciò la proposta del gruppo del Partito del socialismo europeo, da lei menzionata, è che mercoledì si mantenga la presentazione del programma per la legislatura della Commissione Prodi, inserendovi anche il progetto di riforma amministrativa. Altrimenti potremmo ritrovarci in una situazione paradossale: con la scusa che non c' è il testo, si nega da un lato il diritto del Presidente della Commissione di rivolgersi a questo Parlamento, e dall' altro che abbia luogo una discussione sulla riforma, senza che il Parlamento conosca a priori i testi su cui si basa. Pertanto, signora Presidente, la prego di chiedere alla Commissione di esprimersi subito e poi di procedere al voto.
+(Applausi dai banchi del gruppo del partito del socialismo europeo)
+
+Signora Presidente, onorevoli colleghi, sono piuttosto sorpreso del comportamento del collega, onorevole Barón Crespo, che ora pretende che il punto in questione venga inserito nell'ordine del giorno di mercoledì.
+Onorevole collega Barón Crespo, lei non ha potuto partecipare giovedì scorso alla Conferenza dei presidenti. Non la biasimo per questo: può sempre succedere che si debba essere sostituiti. Il collega Hänsch è intervenuto in sua vece. In sede di Conferenza dei presidenti ne abbiamo discusso approfonditamente. Soltanto un gruppo politico condivideva l'opinione da lei espressa in questa sede. La questione è stata posta ai voti. Come è noto, ciascun presidente dispone di un numero di voti pari al numero dei deputati iscritti al proprio gruppo politico. Il punto in questione è stato oggetto di una votazione in cui, se ben ricordo, vi sono stati 422 voti contrari e 180 a favore con poche astensioni. Ciò significa che tutti i gruppi politici, ad eccezione dei non iscritti - che però non costituiscono un gruppo politico -, erano concordi e che un solo gruppo era del parere di procedere come proposto dal collega in questa sede. Tutti gli altri erano di diversa opinione e così è stato deciso.
+Vorrei ora entrare brevemente nel merito. Abbiamo fiducia nella Commissione, in Romano Prodi e la grande maggioranza del nostro gruppo politico, come tutti sanno, dopo un difficile processo ha votato la fiducia a Romano Prodi e alla Commissione. Tuttavia siamo anche dell'idea che la strategia della Commissione vada discussa nel corso di una procedura regolare, non soltanto in base a una dichiarazione rilasciata oralmente in questo Parlamento ma anche in base a un documento adottato dalla Commissione che illustri tale programma per i prossimi cinque anni. Ma un tale documento non esiste ancora!
+
+La Commissione presenterà il programma per il 2000 in febbraio. Abbiamo acconsentito: se la Commissione non vuole discutere il programma 2000 in gennaio lo faremo in febbraio. Non è certo nostra intenzione entrare in conflitto con la Commissione. Al contrario, pensiamo che per quanto possibile la Commissione e il Parlamento debbano percorrere una strada comune. Il Parlamento, tuttavia, esercita anche funzioni di controllo nei confronti della Commissione e non tutto ciò che viene proposto da quest'ultima deve necessariamente trovarci concordi.
+Vorrei che all'interno dei gruppi politici potessimo prepararci adeguatamente al dibattito sul programma quinquennale. Non è possibile farlo ascoltando una dichiarazione di cui non conosciamo con esattezza il contenuto. Perciò raccomandiamo - e ho l'impressione che anche la Commissione sia disposta ad accogliere questa idea - di discutere in febbraio il programma a lungo termine della Commissione che si estende fino al 2005 - sperando che a quel punto la Commissione abbia concordato un programma che ci sottoporrà - e, sempre in febbraio, anche il programma legislativo della Commissione per l'anno 2000. E' dunque sulla base di un nesso oggettivo che proponiamo di discutere contestualmente i due programmi e per questa ragione il mio gruppo politico respinge decisamente la proposta del gruppo socialista!
+(Applausi dai banchi del gruppo del PPE-DE)
+
+Signora Presidente, desidero affermare chiaramente che, innanzi tutto, la Commissione nutre il massimo rispetto per le decisioni del Parlamento tra cui quella di elaborare il proprio ordine del giorno. Quindi, noi rispettiamo le eventuali decisioni in materia del Parlamento.
+Ma voglio dire altrettanto chiaramente che il Presidente Prodi si è impegnato con il Parlamento a inserire un nuovo dibattito, come ha ricordato l' onorevole Barón, oltre al dibattito annuale sul programma legislativo della Commissione, sulle grandi linee di azione per il prossimo quinquennio, cioè per la presente legislatura.
+Tengo a sottolineare, signora Presidente, che, secondo l' accordo concluso nel settembre scorso, questo dibattito era distinto dalla presentazione del programma legislativo della Commissione. E desidero far sapere che, per quanto riguarda la Commissione, siamo pronti e disposti a tenere questo dibattito quando lo si ritenga opportuno; eravamo già pronti a farlo questa settimana, conformemente all' accordo iniziale, sulla base dell' intesa di una presentazione del discorso ai gruppi parlamentari il giorno prima.
+Quindi, signora Presidente, ribadisco che da parte nostra abbiamo discusso del programma di azione per il prossimo quinquennio e che siamo pronti a presentarlo quando vuole il Parlamento - anche questa settimana, se decide il tal senso- mentre il mese prossimo toccherà al programma per il 2000, esattamente come era stato stabilito.
+
+Propongo di porre in votazione la richiesta del gruppo PSE di iscrivere nuovamente all' ordine del giorno la dichiarazione della Commissione sui suoi obiettivi strategici.
+(Il Parlamento respinge la richiesta)
+Presidente. Sempre sulla giornata di mercoledì ho ricevuto un' altra proposta di modifica relativa alla interrogazione orale sull' imposta patrimoniale che il gruppo PPE-DE chiede di ritirare dall' ordine del giorno.
+Qualcuno desidera intervenire a nome del gruppo per motivare tale richiesta?
+
+Signora Presidente, sento qualche risata fra i socialisti. Mi è stato detto che anche una parte cospicua del gruppo socialista vorrebbe che questo punto venisse ritirato dall'ordine del giorno, in quanto nella votazione in sede di Conferenza dei presidenti è mancato il voto del gruppo di lavoro dei colleghi competenti del gruppo socialista. Non so se questa informazione sia corretta, ma noi del PPE-DE saremmo comunque grati se il suddetto punto venisse eliminato visto che il Parlamento si è già occupato ripetutamente della questione. Esistono anche alcune decisioni contro tale imposta. Per questa ragione il gruppo del PPE-DE chiede che il punto in questione venga ritirato dall'ordine del giorno.
+
+La ringrazio, onorevole Poettering.
+Ha facoltà l' onorevole Wurtz che interviene contro la richiesta.
+
+Signora Presidente, vorrei innanzi tutto sottolineare la mancanza di logica da parte dell' onorevole Poettering che ha appena fatto la morale al gruppo socialista per essere tornato su una decisione approvata dalla Conferenza dei presidenti con una netta maggioranza. Ebbene, egli ha fatto la stessa cosa. Si era discusso e tutti si erano detti d' accordo, tranne il gruppo PPE-DE e il gruppo ELDR. In tale circostanza - gli onorevoli colleghi presidenti lo ricorderanno - avevo altresì sottolineato che il punto non era tanto sapere se siamo a favore o contro la tassa Tobin, ma se abbiamo il coraggio di ascoltare che cosa ne pensano la Commissione e il Consiglio. Non è chiedere molto. Pertanto, reitero la proposta di mantenere all' ordine del giorno l' interrogazione orale al Consiglio e alla Commissione, per apprendere, una volta per tutte, la posizione delle due Istituzioni su una proposta relativamente modesta, ma che lancerebbe un segnale importante all' opinione pubblica, soprattutto sull' onda dell' emozione suscitata dal fallimento della Conferenza di Seattle.
+
+Pongo ora in votazione la richiesta del gruppo PPE-DE intesa a ritirare dall' ordine del giorno l' interrogazione orale sull' imposta patrimoniale.
+(Il Parlamento respinge la richiesta con 164 voti favorevoli, 166 contrari e 7 astenuti)
+
+Signora Presidente, ringrazio l' onorevole Poettering per l' inaspettata pubblicità che ha appena dato a questa discussione. Grazie!
+
+Signora Presidente, è stato contato il mio voto, che non è stato deposto elettronicamente, perché non ho la scheda? Il mio voto era favorevole.
+
+In effetti, se aggiungiamo i voti dei due colleghi che sono intervenuti, il risultato...
+
+Signora Presidente, la Presidenza ha già annunciato l' esito della votazione. Non sono ammesse modifiche.
+
+Onorevoli colleghi, ancora una volta, occorre presentarsi in Aula con la carta di votazione anche il lunedì. Evidentemente abbiamo un problema sul quale sono chiamata a prendere una decisione.
+Anch' io ho dimenticato la mia carta di votazione e avrei votato contro. Ritengo pertanto che si debba concludere che l' interrogazione orale rimane iscritta all' ordine del giorno.
+
+Questa è l' ultima volta che si terrà conto del voto dei deputati che hanno dimenticato la loro carta di votazione. Che sia ben chiaro per tutti.
+(Applausi)
+Sì, l' interrogazione orale resta all' ordine del giorno ed ebbene sì, la Presidente ha diritto di votare così come ha diritto anche di dimenticare la carta di votazione.
+Proseguiamo con le altre modifiche all' ordine del giorno.
+
+Signora Presidente, premetto che rispetterò la sua decisione, ma durante la votazione precedente sulla questione del piano strategico della Commissione avevo manifestato l'intenzione di intervenire a nome del mio gruppo prima della votazione, ma non ne ho avuto la possibilità. Le sarei molto grato se prima di chiudere questo punto all'ordine del giorno mi permettesse di rilasciare una dichiarazione di voto a nome del mio gruppo. Si tratta di una cosa importante e sarebbe utile poter mettere a verbale il motivo del nostro comportamento al momento della votazione alla luce della nostra analisi politica.
+
+Signora Presidente, non è mia intenzione riprendere la discussione, ma anch'io avevo chiesto di intervenire per prendere posizione sulla richiesta dell'onorevole Barón Crespo. Lei non mi ha dato la parola. Ne sono spiacente anche se ormai la votazione è già stata effettuata, la decisione presa e quindi la questione è da considerarsi chiusa.
+
+Me ne dispiace, onorevoli Hänsch e Cox, non mi ero accorta che avevate chiesto la parola. Comunque mi pare che le posizioni siano chiare e saranno riportate al processo verbale. Quando domani approveremo il processo verbale della seduta odierna i colleghi che dovessero ritenerlo inadeguato potranno chiedere che esso venga modificato. Mi pare una buona soluzione. Evidentemente il processo verbale della seduta di domani riporterà tutte le eventuali dichiarazioni complementari. Mi pare una formula migliore piuttosto che procedere ora alle dichiarazioni di voto che ci porterebbero molto lontano. Onorevole Cox, onorevole Hänsch, siete d' accordo?
+
+Signora Presidente, se il verbale rispecchierà correttamente il voto del mio gruppo non avrò alcuna obiezione. Ma se la sua decisione implica che non posso rilasciare una dichiarazione di voto, la accetto con delle riserve.
+
+Faremo molta attenzione allora alla redazione del processo verbale, come per altro facciamo sempre. Se esso non rifletterà correttamente le posizioni potrà essere modificato.
+(Il Parlamento approva l' ordine del giorno così modificato)
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/nl.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/nl.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/nl.test
new file mode 100644
index 0000000..92473f7
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/nl.test
@@ -0,0 +1,105 @@
+Hervatting van de zitting
+Ik verklaar de zitting van het Europees Parlement, die op vrijdag 17 december werd onderbroken, te zijn hervat. Ik wens u allen een gelukkig nieuwjaar en hoop dat u een goede vakantie heeft gehad.
+Zoals u heeft kunnen constateren, is de grote "millenniumbug" uitgebleven. De burgers van een aantal van onze lidstaten zijn daarentegen door verschrikkelijke natuurrampen getroffen. U heeft aangegeven dat u deze vergaderperiode een debat wilt over deze rampen. Nu wil ik graag op verzoek van een aantal collega's een minuut stilte in acht nemen ter nagedachtenis van de slachtoffers. Ik doel hiermee met name op de slachtoffers van het noodweer dat verschillende lidstaten van de Unie heeft geteisterd. Ik wil u vragen deze minuut stilte staande in acht te nemen.
+(Het Parlement neemt staande een minuut stilte in acht)
+
+Mevrouw de Voorzitter, ik wil een motie van orde stellen. U zult via de media hebben vernomen dat er zich in Sri Lanka een aantal bomexplosies en schietpartijen hebben voorgedaan. Een van de mensen die zeer recent in Sri Lanka is vermoord, is de heer Kumar Ponnambalam, die een paar maanden geleden nog een bezoek bracht aan het Europees Parlement. Zou u, mevrouw de Voorzitter, wellicht een brief kunnen schrijven aan de President van Sri Lanka, waarin u laat weten dat uzelf en het Europees Parlement deze moord en de overige brute moorden in Sri Lanka diep betreuren, en waarin u haar oproept al het nodige te doen om in deze zeer moeilijke situatie langs vreedzame weg tot verzoening te komen?
+
+Ja, mijnheer Evans, ik denk dat een dergelijk initiatief zeer gepast zou zijn. Als het Parlement ermee instemt, geef ik graag gevolg aan uw suggestie.
+
+ Mevrouw de Voorzitter, ik wil een motie van orde stellen. Ik vraag uw advies over artikel 143 van het Reglement, met betrekking tot niet-ontvankelijkheid. Mijn vraag betreft een zaak die voor donderdag op de agenda staat en waarop ik dan opnieuw zal ingaan.
+Het verslag Cunha, over het resultaat van de meerjarige ori�ntatieprogramma's voor de vissersvloot, wordt donderdag in het Parlement besproken. In artikel 6 van dit verslag wordt voorgesteld een soort strafkorting op vangstquota toe te passen voor lidstaten die zich niet houden aan de jaarlijkse doelstellingen voor vlootinkrimping. Dit zou moeten gebeuren in weerwil van het beginsel van relatieve stabiliteit. Naar mijn mening is de relatieve stabiliteit een grondbeginsel van het gemeenschappelijk visserijbeleid, en zou een voorstel om dit te ondermijnen juridisch niet-ontvankelijk zijn. Ik zou graag willen weten of een dergelijk bezwaar kan worden ingebracht tegen enkel een verslag, dus niet een ontwerpwetgevingsvoorstel, en of ik aanstaande donderdag gerechtigd ben dit te doen.
+
+Dat is inderdaad het juiste moment om dit vraagstuk ter tafel te brengen, dat wil zeggen donderdag voordat het verslag wordt gepresenteerd.
+
+Mevrouw de Voorzitter, helaas valt de eerste vergaderperiode van het Europees Parlement in het nieuwe jaar samen met de executie van een jonge terdoodveroordeelde van 34 jaar, Hicks genaamd. Deze executie is door de staat Texas van de Verenigde Staten vastgesteld voor aanstaande donderdag.
+Op initiatief van een van de Franse afgevaardigden, de heer Zimeray, is al een petitie aangeboden, die door velen is ondertekend, onder andere door mij. Maar ik wil u vragen om, in uw hoedanigheid van Voorzitter van het Europees Parlement, uw invloed aan te wenden bij de president en de gouverneur van de staat Texas, de heer Bush, die de macht heeft om de terdoodveroordeling op te schorten en de veroordeelde gratie te verlenen, in overeenstemming met het standpunt van het Europees Parlement en de gehele Europese Gemeenschap, zoals dat herhaaldelijk naar voren is gekomen.
+Bovendien is een dergelijke actie in overeenstemming met onze grondbeginselen.
+
+Hartelijk dank, mijnheer Segni, dat wil ik graag doen. Het ligt ook geheel in de lijn van de standpunten die ons Parlement altijd met betrekking tot dergelijke vraagstukken heeft ingenomen.
+
+Mevrouw de Voorzitter, ik zou op een geval willen wijzen waarmee dit Parlement zich keer op keer bezig heeft gehouden, het geval-Alexander Nikitin. We zijn allemaal blij dat de rechter hem heeft vrijgesproken en duidelijk heeft gemaakt dat ook in Rusland het recht op toegang tot informatie over het milieu door de grondwet wordt gegarandeerd. Hij schijnt echter weer voor de rechter te moeten verschijnen, omdat de officier van justitie in beroep gaat. We weten allemaal, en hebben dat ook gezegd in talloze resoluties, onder andere nog tijdens de laatste plenaire vergadering vorig jaar, dat dit niet alleen een geval voor juristen is, en dat het verkeerd is Alexander Nikitin te beschuldigen van misdrijven en verraad. Wij zijn namelijk direct betrokken bij de resultaten van zijn onderzoek en profiteren ervan. Deze resultaten vormen de basis voor de Europese programma' s voor de bescherming van de Barentszzee, en daarom zou ik u willen vragen een brief te bestuderen waarin de belangrijkste
  feiten worden samengevat, en een standpunt te bepalen over de besluiten van het Russische parlement.
+
+Mevrouw Schroedter, nadat ik uw brief ontvangen heb, zal ik de feiten die op dit vraagstuk betrekking hebben, zeker bestuderen.
+
+Voorzitter, ik zou u eerst een compliment willen maken met het feit dat u woord hebt gehouden en dat inderdaad nu in deze eerste vergaderperiode in het nieuwe jaar het aantal televisiezenders op onze kamer daadwerkelijk enorm is uitgebreid. Maar, Voorzitter, er is niet gebeurd waar ik om had gevraagd. Er zijn nu weliswaar twee Finse zenders en een Portugese zender, maar er is nog steeds geen Nederlandse zender en ik had u gevraagd om een Nederlandse zender, omdat ook Nederlanders graag het nieuws willen volgen, iedere maand als wij hier naar dit verbanningsoord worden gestuurd. Dus ik zou u nogmaals willen vragen om er toch zorg voor te dragen dat wij ook een Nederlandse zender krijgen.
+
+Mevrouw Plooij-van Gorsel, ik kan u mededelen dat dit vraagstuk op de agenda van de vergadering van het College van quaestoren van aanstaande woensdag staat. Het zal, naar ik hoop, op bevredigende wijze worden opgelost.
+
+Mevrouw de Voorzitter, kunt u mij uitleggen waarom dit Parlement zich niet houdt aan gezondheids- en veiligheidsvoorschriften die het zelf vaststelt? Waarom is de kwaliteit van de lucht in dit gebouw nooit onderzocht sinds de verkiezingen? Waarom heeft het Comit� voor veiligheid en hygi�ne sinds 1998 niet meer vergaderd? Waarom is er noch in het Parlementsgebouw in Brussel noch hier in Straatsburg een brandweeroefening gehouden? Waarom zijn er nergens instructies over wat te doen in geval van brand? Waarom zijn er sinds mijn ongeval geen verbeteringen aangebracht aan de trappen? Waarom wordt het rookverbod niet gehandhaafd in daarvoor aangewezen delen van het gebouw? Het is absoluut onaanvaardbaar dat wij wetgeving goedkeuren en dat wij ons daar zelf niet aan houden.
+
+Mevrouw Lynne, u hebt volkomen gelijk. Ik zal nagaan of dit inderdaad niet gebeurd is. Ik zal het vraagstuk verder aan het College van quaestoren voorleggen. Ik ben ervan overtuigd dat onze quaestoren ervoor zullen zorgdragen dat de wetgeving waarover we ons hebben uitgesproken, ook daadwerkelijk wordt nageleefd.
+
+Mevrouw de Voorzitter, mevrouw Díez González en ikzelf hadden samen een aantal vragen gesteld naar aanleiding van bepaalde uitspraken van commissaris de Palacio in een Spaans dagblad. De voor de agenda verantwoordelijke diensten hebben die vragen niet op de agenda geplaatst, daar die vragen al in een andere vergaderperiode beantwoord zouden zijn.
+Daar dat niet zo is, verzoek ik het desbetreffende besluit te heroverwegen. De eerder beantwoorde vragen betreffen de bijdrage van mevrouw de Palacio over een ander dossier en gaan niet over de uitspraken die 18 november jongstleden in de krant ABC zijn verschenen.
+
+Waarde collega, we zullen dit alles natrekken. Ik moet u bekennen dat ik de situatie nu enigszins verwarrend vind. We zullen daarom deze kwestie zorgvuldig nagaan en correct volgens de regels handelen.
+
+Mevrouw de Voorzitter, ik zou graag willen weten of het Parlement deze week duidelijk zijn ongenoegen zal laten blijken over het vandaag genomen besluit het wapenembargo tegen Indonesi� niet te verlengen. De overgrote meerderheid van het Parlement heeft zich in het verleden immers uitgesproken voor dit embargo.Gezien de situatie in Indonesi� is het besluit het embargo niet te verlengen uiterst riskant. Het Parlement moet dan ook een signaal afgeven, aangezien een grote meerderheid dit wenst.Dat de lidstaten van de Europese Unie weigeren het embargo te verlengen, is zonder meer onverantwoordelijk, de explosieve situatie in het land in aanmerking nemend. Het gevaar van een militaire coup is niet denkbeeldig.Wij weten niet wat er aan de hand is. Waarom zouden EU-wapenfabrikanten moeten profiteren van een situatie die ten koste gaan van onschuldige mensen?
+
+Dit punt staat op dit moment in ieder geval niet voor het actualiteitendebat van donderdag ingeschreven.
+
+Regeling van de werkzaamheden
+Aan de orde is de behandeling van de definitieve ontwerpagenda zoals deze op de Conferentie van voorzitters op donderdag 13 januari is vastgesteld overeenkomstig artikel 110 van het Reglement. Ik heb geen wijzigingen voor de maandag en de dinsdag.
+Woensdag 19 januari:
+De socialistische fractie vraagt om inschrijving van een verklaring van de Commissie over haar strategische doelstellingen voor de komende vijf jaar alsmede over de bestuurlijke hervorming van de Commissie.
+Ik zou de heer Bar�n Crespo willen vragen zijn verzoek om inschrijving hier toe te lichten. Vervolgens zullen we de gebruikelijke procedure volgen, dat wil zeggen dat we een voorstander en een tegenstander van dit verzoek het woord zullen geven.
+
+Mevrouw de Voorzitter, de presentatie van het politieke programma van de Commissie Prodi voor de hele legislatuur was oorspronkelijk een voorstel van Fractie van de Partij van de Europese Sociaal-Democraten, dat in september de unanieme steun kreeg van de Conferentie van voorzitters. Ook de heer Prodi gaf er uitdrukkelijk zijn steun aan, en in de rede waarmee hij zijn benoeming aanvaardde, heeft hij die toezegging nog eens herhaald.
+Die toezegging is vooral belangrijk omdat de Commissie volgens de Verdragen het monopolie heeft op het initiatiefrecht en dus eigenlijk vorm geeft aan wat de komende vijf jaar de politieke en de wetgevende activiteit van dit Parlement zal zijn. Bovendien wil ik eraan herinneren, mevrouw de Voorzitter, dat dit Parlement in de vorige legislatuur tweemaal zijn vertrouwen heeft uitgesproken in de heer Prodi als voorzitter van de Commissie. Tijdens de nieuwe legislatuur is dat vertrouwen nogmaals uitgesproken in juli, en na het aantreden van de voltallige nieuwe Commissie is dat vertrouwen in september aan de hele Commissie geschonken. Er is derhalve in theorie voldoende tijd geweest voor de Commissie om haar programma op te stellen en voor ons om er dan kennis van te nemen en uitleg te geven aan de burgers. In dit verband wil ik wijzen op de resolutie van 15 september, waarin de Commissie de aanbeveling kreeg het voorstel zo snel mogelijk in te dienen.
+De gebeurtenissen van de vorige week in de marge van de Conferentie van voorzitters, waarbij de Conferentie van voorzitters alleen maar gebruikt is om elders genomen besluiten te bekrachtigen en te ratificeren, plaatsen ons voor een dilemma: ofwel de Commissie is niet in staat dat programma te presenteren, ofwel, zoals sommigen schijnen te beweren, het Parlement is niet in staat dat programma te behandelen. In het eerste geval zou de Commissie haar onvermogen moeten toelichten, want volgens de voorzitter van de Commissie is de Commissie klaar voor de presentatie. Daar de Commissie hier vertegenwoordigd is door haar vice-voorzitter mevrouw de Palacio, denk ik dat het gepast zou zijn om voor wij gaan stemmen van de Commissie te horen hoe het zit met haar bereidheid het programma te presenteren zoals is afgesproken. Naar mijn mening houdt de tweede hypothese in dat wij als Parlement onze verantwoordelijkheid verwaarlozen. Volgens een tot nu onbekende procedure krijgen de fracties een w
 eek van tevoren - en niet zoals afgesproken ��n dag van tevoren - de geschreven toespraak over het programma van de Commissie. Bovendien is dit wel een zeer originele thesis, want het wetgevingsprogramma wordt in februari behandeld. Wij zouden dan ook kunnen afzien van het debat over dat programma, daar de dag nadat de fracties het gekregen hebben alle burgers dan via de pers en Internet ge�nformeerd zouden zijn en het Parlement niets meer met het onderwerp zou kunnen doen.
+Daar onze fractie meent dat het de taak is van een parlement om te luisteren, te debatteren en na te denken, geloven wij dat er geen enkele reden is die dit uitstel rechtvaardigt. Ook menen wij dat indien de Commissie in staat is het programma te presenteren, wij zeker nog de tijd hebben om terug te komen op de oorspronkelijke afspraak tussen het Parlement en de Commissie, en tegenover onze medeburgers op verantwoordelijke wijze te werk te gaan. Derhalve doet de Fractie van de Partij van de Europese Sociaal-Democraten het voorstel dat door u genoemd is, mevrouw de Voorzitter, om woensdag de presentatie van het programma voor de hele legislatuur van de Commissie Prodi te handhaven. Bij dat programma hoort ook de administratieve hervorming van de Commissie, want anders zouden wij met een paradoxale situatie te maken kunnen krijgen. Onder het voorwendsel dat er geen tekst is, wordt namelijk aan de ene kant de voorzitter van de Commissie het recht ontzegd voor dit Parlement te spreken e
 n wordt er aan de andere kant een debat gehouden over de hervorming, terwijl het Parlement de teksten daarvan niet van tevoren kent. Daarom verzoek ik u, mevrouw de Voorzitter, eerst de Commissie te vragen om haar mening te laten horen en daarna te laten stemmen.
+(Applaus van de PSE-fractie)
+
+Mevrouw de Voorzitter, geachte collega's, de houding van de heer Bar�n Crespo verbaast me wel een beetje. Hij wil dit punt op de agenda van woensdag plaatsen.
+Mijnheer Bar�n Crespo, u kon niet deelnemen aan de vergadering van de Conferentie van voorzitters op donderdag jongstleden. Daar heb ik het volste begrip voor; het kan altijd gebeuren dat iemand een plaatsvervanger moet sturen. De heer H�nsch heeft u vertegenwoordigd. Tijdens de vergadering van de Conferentie van voorzitters hebben we een uitvoerig debat gevoerd. Uw fractie was de enige die voorstelde wat u hier nu wilt. Daarover hebben we gestemd. Iedere voorzitter heeft zoveel stemmen als zijn of haar fractie leden heeft. Als ik me niet vergis, was het resultaat 422 stemmen tegen 180, bij slechts een paar onthoudingen. Dat betekent dat alle fracties het met elkaar eens waren, behalve de niet-ingeschreven leden, en die vormen geen fractie. Alleen uw fractie heeft voor het voorstel gestemd dat u hier herhaalt. De anderen waren het niet met u eens, en het besluit is genomen.
+Nu wil ik nog even op de zaak zelf ingaan. Wij hebben vertrouwen in de Commissie, in Romano Prodi, en zoals u allemaal weet, heeft de overwegende meerderheid van onze fractie Romano Prodi en de Commissie na een moeizaam proces het vertrouwen geschonken. We vinden echter ook dat we het debat over de strategie van de Commissie volgens de juiste procedure moeten voeren, en niet zomaar naar aanleiding van een mondelinge verklaring hier in het Europees Parlement. We hebben daarvoor ook een document nodig, dat door de Commissie is goedgekeurd en dit vijfjarig programma beschrijft. Een dergelijk document bestaat echter niet!
+De Commissie zal het programma voor 2000 in februari voorstellen. We hebben gezegd: goed, als de Commissie het programma voor 2000 nog niet in januari wil voorstellen, dan moet dat maar in februari. We hebben daarmee ingestemd. Wij willen tenslotte helemaal geen ruzie met de Commissie; wij vinden dat de Commissie en het Europees Parlement als het enigszins mogelijk is een gezamenlijke weg moeten vinden. Als Parlement zijn we echter ook verantwoordelijk voor de controle van de Commissie. Wat van de Commissie komt, is niet altijd ook ons standpunt.
+Ik ben van mening dat de fracties zich gedegen moeten kunnen voorbereiden op het debat over het vijfjarig programma. Naar een verklaring luisteren en van tevoren helemaal niet weten wat de inhoud van die verklaring is, dat is geen goede voorbereiding. Daarom stellen wij voor - en ik heb de indruk dat de Commissie wel openstaat voor dit idee - dat we in februari het debat voeren over het langetermijnprogramma van de Commissie voor de periode tot 2005. Ik hoop dat de Commissie het tegen die tijd eens is geworden over het programma dat ze dan aan ons zal voorleggen. Wij stellen echter voor dat we tegelijkertijd in februari het debat voeren over het wetgevend programma van de Commissie voor 2000. Er zijn goede inhoudelijke redenen om het debat over die twee programma' s samen te voeren. Daarom wijst mijn fractie het voorstel van de socialistische fractie met nadruk van de hand!
+(Applaus van de PPE-DE-Fractie)
+
+Mevrouw de Voorzitter, allereerst wil ik duidelijk maken dat de Commissie alle respect heeft voor de besluiten die dit Parlement neemt, met inbegrip van het opstellen van de agenda. Derhalve respecteren wij wat het Parlement in dit verband zal besluiten.
+Maar ik wens ook duidelijk te maken dat voorzitter Prodi zich tegenover het Parlement verplicht heeft, zoals de heer Bar�n in herinnering heeft geroepen, om naast het jaarlijks debat over het wetgevingsprogramma van de Commissie een nieuw debat te houden over de grote lijnen van het beleid voor de komende vijf jaar, dat wil zeggen voor deze hele legislatuur.
+Ik wil erop wijzen, mevrouw de Voorzitter, dat dat debat zich volgens het in septembere bereikte akkoord zou onderscheiden van het debat over het jaarlijks wetgevingsprogramma van de Commissie. Ook wil ik laten weten, mevrouw de Voorzitter, dat wij als Commissie bereid en klaar zijn om dat debat op het gepaste moment te houden. Wij waren ook klaar voor een debat gedurende deze week, zoals in beginsel was afgesproken, met dien verstande dat de dag voor het debat het woord aan de fracties zou worden gegeven.
+Daarom wil ik herhalen, mevrouw de Voorzitter, dat wij als Commissie het beleidsprogramma voor de komende vijf jaar besproken hebben en dat, indien het Parlement zo mocht besluiten - eventueel nog deze week -, wij exact volgens de afspraak klaar zijn om dit programma voor de komende vijf jaar te komen toelichten en om de volgende maand hetzelfde te doen met het programma voor het jaar 2000.
+
+Ik stel voor dat we gaan stemmen over het verzoek van de socialistische fractie om de verklaring van de Commissie over haar strategische doelstellingen opnieuw in te schrijven.
+(Het Parlement verwerpt het verzoek)
+Nog altijd met betrekking tot de woensdag heb ik een ander verzoek ontvangen. De PPE-DE-Fractie wil dat de mondelinge vragen over hoofdelijke belasting van de agenda worden geschrapt.
+Welke vertegenwoordiger van deze fractie wil het woord voeren om dit verzoek toe te lichten?
+
+Mevrouw de Voorzitter, ik heb wat gelach gehoord van de banken van de socialistische fractie. Ik heb ook gehoord dat veel leden van die fractie dit punt graag van de agenda zouden willen afvoeren, omdat ten tijde van de stemming in de Conferentie van voorzitters het standpunt van de socialistische leden van de werkgroep die zich met deze zaken bezig houdt, nog niet beschikbaar was. Ik weet niet of dat klopt, maar wij als PPE-DE-Fractie zouden dankbaar zijn als dit punt van de agenda zou kunnen worden afgevoerd. Het Parlement heeft zich namelijk al meerdere malen met deze kwestie bezig gehouden. Er is ook al besloten om deze belasting af te wijzen. Daarom vraagt mijn fractie dit punt van de agenda af te voeren.
+
+Hartelijk dank, mijnheer Poettering.
+We geven nu de heer Wurtz de gelegenheid om zich tegen dit verzoek uit te spreken.
+
+Mevrouw de Voorzitter, ik wil allereerst benadrukken dat de logica in het betoog van de heer Poettering ver te zoeken is. Zojuist heeft hij de socialistische fractie de les gelezen omdat deze fractie terugkomt op een besluit dat overduidelijk tijdens de Conferentie van voorzitters is genomen. Vervolgens handelt hij op precies dezelfde wijze als deze fractie. Wij hebben over dit vraagstuk gedebatteerd en waren, met uitzondering van de PPE-DE-Fractie en de liberale fractie, allen dezelfde mening toegedaan. Zoals u zich herinnert, heb ik opgemerkt, waarde collega-voorzitters, dat het er niet zozeer toe doet of u voor of tegen de Todin-heffing bent, maar dat u van de Commissie en de Raad durft te verlangen dat ze hun mening over dit vraagstuk kenbaar maken. Dat is toch niet teveel gevraagd. Ik herhaal dus mijn voorstel om deze mondelinge vraag aan de Commissie en de Raad te handhaven zodat we voor eens en altijd weten hoe deze twee instellingen over dit relatief eenvoudige verzoek denke
 n. We kunnen zo immers een belangrijk signaal aan de burgers afgeven, zeker na de commotie die na het mislukken van de Conferentie van Seattle is ontstaan.
+
+We gaan stemmen over het verzoek van de PPE-DE-Fractie om de mondelinge vragen over hoofdelijke belasting van de agenda te schrappen.
+(Het Parlement verwerpt het verzoek met 164 stemmen voor, 166 stemmen tegen en 7 onthoudingen)
+
+Mevrouw de Voorzitter, ik wil de heer Poettering er hartelijk voor danken dat hij dit debat zo effectief onder de aandacht heeft gebracht. Hartelijk dank.
+
+Mevrouw de Voorzitter, is mijn stem die ik elektronisch niet heb kunnen uitbrengen omdat ik mijn stemkaart niet bij mij heb, meegeteld? Ik was vóór.
+
+Inderdaad, als we de stemmen van beide collega's die van zich hebben laten horen bij de uitslag optellen, dan wordt het resultaat...
+
+Mevrouw de Voorzitter, u heeft de uitslag van de stemming bekendgemaakt. Daaraan mag niets veranderd worden.
+
+Waarde collega's, ik wil nogmaals benadrukken dat iedereen zijn kaart voor maandag bij zich moet hebben. We hebben nu een probleem en ik zal daarom een beslissing moeten nemen.
+Ik ben mijn kaart ook vergeten en zou anders tegen hebben gestemd. Ik vind dus dat de mondelinge vraag op de agenda moet blijven staan.
+Dit is de laatste keer dat we rekening houden met collega's die hun kaart vergeten zijn. Laat dit nu voor eens en altijd duidelijk zijn.
+(Applaus)Ja, de mondelinge vraag blijft op de agenda gehandhaafd, en ja, de Voorzitter heeft het recht om te stemmen. Ze heeft immers ook het recht haar kaart te vergeten.
+We gaan nu verder met de andere wijzigingen in de agenda.
+
+Mevrouw de Voorzitter, laat mij vooropstellen dat ik mij zal neerleggen bij uw uitspraak in dezen, maar bij de eerdere stemming over de Commissiestrategie had ik voorafgaand aan de stemming namens mijn fractie het woord willen voeren. Dit is niet gebeurd. Ik zou het waarderen indien ik na afsluiting van dit punt de gelegenheid zou krijgen namens mijn fractie een stemverklaring uit te spreken. Het gaat om een belangrijke kwestie. Het is een goede zaak voor dit Parlement dat wordt vastgelegd, hoe de mensen vanuit hun eigen politieke analyse de zojuist genomen beslissing beoordelen.
+
+Mevrouw de Voorzitter, ik wil het debat niet opnieuw openen, maar ik had ook om het woord gevraagd. Ik wilde ingaan op het verzoek van de heer Bar�n Crespo. U heeft mij het woord niet verleend. Ik vind dat jammer, maar we hebben nu gestemd en een besluit genomen. Daar wil ik het bij laten.
+
+Mijn excuses, mijnheer H�nsch en mijnheer Cox, ik had niet in de gaten dat u om het woord vroeg. Ik denk dat de stellingname in de notulen zal worden weergegeven. Bij de goedkeuring van notulen van de vergadering van vandaag kunnen de collega's die vinden dat de standpunten niet goed zijn weergegeven, een verzoek tot wijziging indienen. Ik denk dat dit een goede oplossing is. Uiteraard zal in de notulen van de vergadering van morgen rekening gehouden worden met al deze aanvullende verklaringen. Ik denk dat dit beter is dan nu stemverklaringen af te leggen. We zouden dan immers teveel afdwalen. Mijnheer Cox en mijnheer H�nsch, kunt u zich in mijn voorstel vinden?
+
+Mevrouw de Voorzitter, als het stemgedrag van mijn fractie correct is weergegeven, zal en kan ik hiertegen geen bezwaar maken. Indien uw besluit is dat ik geen stemverklaring mag afleggen, dan accepteer ik dat, zij het onder voorbehoud.
+
+We zullen dus heel goed opletten bij het opstellen van de notulen. Dat doen we trouwens altijd al. Als de standpunten niet goed in de notulen worden weergegeven, kunnen deze eventueel worden aangepast.
+(Het Parlement neemt de aldus gewijzigde agenda aan)
+


[26/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/WritableTestUtils.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/WritableTestUtils.java b/nutch-core/src/test/java/org/apache/nutch/util/WritableTestUtils.java
new file mode 100644
index 0000000..49bcfa9
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/WritableTestUtils.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.*;
+import org.junit.Assert;
+
+public class WritableTestUtils {
+
+  /** Utility method for testing writables. */
+  public static void testWritable(Writable before) throws Exception {
+    testWritable(before, null);
+  }
+
+  /** Utility method for testing writables. */
+  public static void testWritable(Writable before, Configuration conf)
+      throws Exception {
+    Assert.assertEquals(before, writeRead(before, conf));
+  }
+
+  /** Utility method for testing writables. */
+  public static Writable writeRead(Writable before, Configuration conf)
+      throws Exception {
+
+    DataOutputBuffer dob = new DataOutputBuffer();
+    before.write(dob);
+
+    DataInputBuffer dib = new DataInputBuffer();
+    dib.reset(dob.getData(), dob.getLength());
+
+    Writable after = (Writable) before.getClass().newInstance();
+    if (conf != null) {
+      ((Configurable) after).setConf(conf);
+    }
+    after.readFields(dib);
+    return after;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/crawl-tests.xml
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/crawl-tests.xml b/nutch-core/src/test/resources/crawl-tests.xml
new file mode 100644
index 0000000..01fc683
--- /dev/null
+++ b/nutch-core/src/test/resources/crawl-tests.xml
@@ -0,0 +1,62 @@
+<?xml version="1.0"?>
+
+<!-- Configuration overrides used during unit tests. -->
+
+<configuration>
+
+<property>
+  <name>plugin.includes</name>
+  <value>parse-tika|protocol-http|urlfilter-suffix|scoring-opic</value>
+  <description>Enable required plugins.</description>
+</property>
+
+<property>
+  <name>content.server.port</name>
+  <value>55000</value>
+  <description>Port of http server serving content.</description>
+</property>
+
+<property>
+  <name>fetcher.server.delay</name>
+  <value>0.2</value>
+  <description>The number of seconds the fetcher will delay between 
+   successive requests to the same server.</description>
+</property>
+
+<property>
+  <name>http.agent.name</name>
+  <value>test-nutch</value>
+</property>
+
+<property>
+  <name>http.robots.agents</name>
+  <value>test-nutch,*</value>
+</property>
+
+<property>
+  <name>http.agent.name.check</name>
+  <value>true</value>
+</property>
+
+<property>                                                                                                                                                   
+  <name>http.robots.agents</name>                                                                                                                            
+  <value>test-nutch,*</value>                                                                                                                                
+  <description>The agent strings we'll look for in robots.txt files,                                                                                         
+  comma-separated, in decreasing order of precedence. You should                                                                                             
+  put the value of http.agent.name as the first agent name, and keep the                                                                                     
+  default * at the end of the list. E.g.: BlurflDev,Blurfl,*                                                                                                 
+  </description>                                                                                                                                             
+</property>
+
+<property>
+  <name>io.serializations</name>
+  <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+  <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+  <description>A list of serialization classes that can be used for
+  obtaining serializers and deserializers.</description>
+</property>
+
+</configuration>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/domain-urlfilter.txt
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/domain-urlfilter.txt b/nutch-core/src/test/resources/domain-urlfilter.txt
new file mode 100644
index 0000000..955700a
--- /dev/null
+++ b/nutch-core/src/test/resources/domain-urlfilter.txt
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# config file for urlfilter-domain plugin
+
+com
+org
+net
+edu
+gov

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html b/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html
new file mode 100644
index 0000000..6444c41
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html
@@ -0,0 +1,11 @@
+<html>
+ <head>
+  <title>page a</title>
+ </head>
+<body>
+This is page a
+<a href="index.html">home</a>
+<hr>
+Nutch fetcher test page
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/exception.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/exception.html b/nutch-core/src/test/resources/fetch-test-site/exception.html
new file mode 100644
index 0000000..e1192a1
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/exception.html
@@ -0,0 +1,13 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<HTML>
+<HEAD>
+<TITLE>Exception</TITLE>
+<META http-equiv="Content-Type" content="text/html; charset=unicode">
+</HEAD>
+<BODY>
+!!Trying to parse this one will fail with a MalformedInputException!!
+
+Nutch fetcher test page.
+</BODY>
+</HTML>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/index.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/index.html b/nutch-core/src/test/resources/fetch-test-site/index.html
new file mode 100644
index 0000000..d73ff3f
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/index.html
@@ -0,0 +1,13 @@
+<html>
+ <head>
+  <title>front page</title>
+ </head>
+<body>
+This is front page.
+<a href="pagea.html">Page a</a>
+<a href="pageb.html">Page b</a>
+<a href="dup_of_pagea.html">dup of Page a</a>
+<hr>
+Nutch fetcher test page
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html b/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html
new file mode 100644
index 0000000..5dcf7c2
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/nested_spider_trap.html
@@ -0,0 +1,23 @@
+<html>
+<head>
+<title>nested spider trap</title>
+</head>
+
+<body>Nutch fetcher test page
+<table>
+  <tr> 
+    <td>
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> 
+<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> 
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i
 ></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>
 </i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> 
+<i><b><i><b><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></
 b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></b></i></b></i> 
+<b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><
 b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b
 ><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b>
 <i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><
 i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i
 ><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i>
 <b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><
 b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b><
 /i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></
 b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i
 ></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>
 </i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i><
 /b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></
 i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b
 ></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i>
 </b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b><
 /i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> 
+</b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b> 
+<i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><
 i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b><i><b></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b>
 </i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i></b></i> 
+
+    </td>
+  </tr>
+ 
+</table>
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/pagea.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/pagea.html b/nutch-core/src/test/resources/fetch-test-site/pagea.html
new file mode 100644
index 0000000..6444c41
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/pagea.html
@@ -0,0 +1,11 @@
+<html>
+ <head>
+  <title>page a</title>
+ </head>
+<body>
+This is page a
+<a href="index.html">home</a>
+<hr>
+Nutch fetcher test page
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/pageb.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/pageb.html b/nutch-core/src/test/resources/fetch-test-site/pageb.html
new file mode 100644
index 0000000..66e3725
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/pageb.html
@@ -0,0 +1,11 @@
+<html>
+ <head>
+  <title>bage b</title>
+ </head>
+<body>
+This is page b
+<a href="index.html">home</a>
+<hr>
+Nutch fetcher test page
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/fetch-test-site/robots.txt
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/robots.txt b/nutch-core/src/test/resources/fetch-test-site/robots.txt
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/filter-all.txt
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/filter-all.txt b/nutch-core/src/test/resources/filter-all.txt
new file mode 100644
index 0000000..4ed567a
--- /dev/null
+++ b/nutch-core/src/test/resources/filter-all.txt
@@ -0,0 +1,7 @@
+# Config file for urlfilter-suffix plugin
+# Filter away all urls
+
+# case-insensitive, disallow unknown suffixes
+-I
+
+# allow these

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/log4j.properties b/nutch-core/src/test/resources/log4j.properties
new file mode 100644
index 0000000..3ff115f
--- /dev/null
+++ b/nutch-core/src/test/resources/log4j.properties
@@ -0,0 +1,7 @@
+# log4j configuration used during build and unit tests
+
+log4j.rootLogger=info,stdout
+log4j.threshold=ALL
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/nutch-site.xml
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/nutch-site.xml b/nutch-core/src/test/resources/nutch-site.xml
new file mode 100644
index 0000000..dd40873
--- /dev/null
+++ b/nutch-core/src/test/resources/nutch-site.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+
+<!-- Configuration overrides used during unit tests. -->
+
+<configuration>
+
+<property>
+  <name>plugin.includes</name>
+  <value>.*</value>
+  <description>Enable all plugins during unit testing.</description>
+</property>
+
+<property>
+  <name>distributed.search.test.port</name>
+  <value>60000</value>
+  <description>TCP port used during junit testing.</description>
+</property>
+
+</configuration>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-mime-util/test.xlsx
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-mime-util/test.xlsx b/nutch-core/src/test/resources/test-mime-util/test.xlsx
new file mode 100644
index 0000000..de33f28
Binary files /dev/null and b/nutch-core/src/test/resources/test-mime-util/test.xlsx differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc
new file mode 100644
index 0000000..c321777
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.data.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc
new file mode 100644
index 0000000..5c5d11f
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/.index.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data
new file mode 100644
index 0000000..0f8d263
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/data differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index
new file mode 100644
index 0000000..4dfeaec
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/content/part-00000/index differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc
new file mode 100644
index 0000000..c4d315a
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.data.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc
new file mode 100644
index 0000000..6dd171e
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/.index.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data
new file mode 100644
index 0000000..66b1f8d
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/data differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index
new file mode 100644
index 0000000..ad4ed47
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_fetch/part-00000/index differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc
new file mode 100644
index 0000000..8d5ffa4
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/.part-00000.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000
new file mode 100644
index 0000000..41ef146
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_generate/part-00000 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc
new file mode 100644
index 0000000..683a1dd
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/.part-00000.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000
new file mode 100644
index 0000000..3232abf
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/crawl_parse/part-00000 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc
new file mode 100644
index 0000000..47164ee
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.data.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc
new file mode 100644
index 0000000..a32d62d
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/.index.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data
new file mode 100644
index 0000000..5b71a24
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/data differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index
new file mode 100644
index 0000000..d931103
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_data/part-00000/index differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc
new file mode 100644
index 0000000..53c925c
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.data.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc
new file mode 100644
index 0000000..5ba878c
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/.index.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data
new file mode 100644
index 0000000..b58f97f
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/data differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index
new file mode 100644
index 0000000..9880a27
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101625/parse_text/part-00000/index differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc
new file mode 100644
index 0000000..1b49819
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.data.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc
new file mode 100644
index 0000000..5aae648
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/.index.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data
new file mode 100644
index 0000000..8069e84
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/data differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index
new file mode 100644
index 0000000..9b19ce9
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/content/part-00000/index differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc
new file mode 100644
index 0000000..926ced1
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.data.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc
new file mode 100644
index 0000000..714a1e8
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/.index.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data
new file mode 100644
index 0000000..f36a9fa
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/data differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index
new file mode 100644
index 0000000..c648d89
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_fetch/part-00000/index differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc
new file mode 100644
index 0000000..3ee3c94
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/.part-00000.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000
new file mode 100644
index 0000000..1ef0406
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_generate/part-00000 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc
new file mode 100644
index 0000000..7948825
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/.part-00000.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000
new file mode 100644
index 0000000..3a83a82
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/crawl_parse/part-00000 differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc
new file mode 100644
index 0000000..b46b6f6
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.data.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc
new file mode 100644
index 0000000..18766e6
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/.index.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data
new file mode 100644
index 0000000..9a1f284
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/data differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index
new file mode 100644
index 0000000..47fb983
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_data/part-00000/index differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc
new file mode 100644
index 0000000..ceada1b
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.data.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc
new file mode 100644
index 0000000..b756b5c
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/.index.crc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data
new file mode 100644
index 0000000..ad96df0
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/data differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index
new file mode 100644
index 0000000..a3e1d8d
Binary files /dev/null and b/nutch-core/src/test/resources/test-segments/20150309101656/parse_text/part-00000/index differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/build-plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/build-plugin.xml b/nutch-plugins/build-plugin.xml
new file mode 100755
index 0000000..c759d5f
--- /dev/null
+++ b/nutch-plugins/build-plugin.xml
@@ -0,0 +1,255 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Imported by plugin build.xml files to define default targets. -->
+<project xmlns:ivy="antlib:org.apache.ivy.ant">
+
+  <property name="name" value="${ant.project.name}"/>
+  <property name="root" value="${basedir}"/>
+
+  <!-- load plugin-specific properties first -->
+  <property file="${user.home}/${name}.build.properties" />
+  <property file="${root}/build.properties" />
+
+  <property name="nutch.root" location="${root}/../../../"/>
+
+  <property name="src.dir" location="${root}/src/java"/>
+  <property name="src.test" location="${root}/src/test"/>
+
+  <available file="${src.test}" type="dir" property="test.available"/>
+
+  <property name="conf.dir" location="${nutch.root}/conf"/>
+
+  <property name="build.dir" location="${nutch.root}/build/${name}"/>
+  <property name="build.classes" location="${build.dir}/classes"/>
+  <property name="build.test" location="${build.dir}/test"/>
+  <property name="build.test.lib" location="${build.test}/lib"/>
+
+  <property name="deploy.dir" location="${nutch.root}/build/plugins/${name}"/>
+
+  <!-- load nutch defaults last so that they can be overridden above -->
+  <property file="${nutch.root}/default.properties" />
+
+  <ivy:settings id="ivy.instance" file="${nutch.root}/ivy/ivysettings.xml" />
+
+  <path id="plugin.deps"/>
+
+  <fileset id="lib.jars" dir="${root}" includes="lib/*.jar"/>
+
+  <!-- the normal classpath -->
+  <path id="classpath">
+    <pathelement location="${build.classes}"/>
+    <fileset refid="lib.jars"/>
+    <pathelement location="${nutch.root}/build/classes"/>
+    <fileset dir="${nutch.root}/build/lib">
+      <include name="*.jar" />
+    </fileset>
+    <path refid="plugin.deps"/>
+    <fileset dir="${deploy.dir}">
+      <include name="*.jar" />
+    </fileset>
+  </path>
+
+  <!-- the unit test classpath -->
+  <path id="test.classpath">
+    <pathelement location="${build.test}" />
+    <pathelement location="${nutch.root}/build/test/classes"/>
+    <pathelement location="${nutch.root}/src/test"/>
+    <pathelement location="${conf.dir}"/>
+    <pathelement location="${nutch.root}/build"/>
+    <!-- test dependencies specific to current plugin -->
+    <fileset dir="${build.test.lib}">
+      <include name="*.jar" />
+    </fileset>
+    <!-- global test dependencies -->
+    <fileset dir="${nutch.root}/build/test/lib">
+      <include name="*.jar" />
+    </fileset>
+    <path refid="classpath"/>
+  </path>
+
+  <!-- ====================================================== -->
+  <!-- Stuff needed by all targets                            -->
+  <!-- ====================================================== -->
+  <target name="init">
+    <mkdir dir="${build.dir}"/>
+    <mkdir dir="${build.classes}"/>
+    <mkdir dir="${build.test}"/>
+    <mkdir dir="${build.test.lib}"/>
+    <mkdir dir="${deploy.dir}"/>
+
+    <antcall target="init-plugin"/>
+  </target>
+
+  <!-- to be overridden by sub-projects --> 
+  <target name="init-plugin"/>
+
+  <!--
+   ! Used to build plugin compilation dependencies
+   ! (to be overridden by plugins)
+   !-->
+  <target name="deps-jar"/>
+
+  <!--
+   ! Used to deploy plugin runtime dependencies
+   ! (to be overridden by plugins)
+   !-->
+  <target name="deps-test"/>
+
+  <!--
+   ! Used to compile test for plugin runtime dependencies
+   ! (to be overridden by plugins)
+   !-->
+  <target name="deps-test-compile"/>
+
+  <!-- ====================================================== -->
+  <!-- Compile the Java files                                 -->
+  <!-- ====================================================== -->
+  <target name="compile" depends="init,deps-jar, resolve-default">
+    <echo message="Compiling plugin: ${name}"/>
+    <javac 
+     encoding="${build.encoding}" 
+     srcdir="${src.dir}"
+     includes="**/*.java"
+     destdir="${build.classes}"
+     debug="${javac.debug}"
+     optimize="${javac.optimize}"
+     target="${javac.version}"
+     source="${javac.version}"
+     deprecation="${javac.deprecation}">
+      <classpath refid="classpath"/>
+    </javac>
+  </target>
+
+  <target name="compile-core">
+    <ant target="compile-core" inheritall="false" dir="${nutch.root}"/>
+    <ant target="compile"/>
+  </target>
+  
+  <!-- ================================================================== -->
+  <!-- Make plugin .jar                                                   -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="jar" depends="compile">
+    <jar
+      jarfile="${build.dir}/${name}.jar"
+      basedir="${build.classes}"
+    />
+  </target>
+
+  <target name="jar-core" depends="compile-core">
+    <jar
+        jarfile="${build.dir}/${name}.jar"
+        basedir="${build.classes}"
+        />
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- Deploy plugin to ${deploy.dir}                                     -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="deploy" depends="jar, deps-test">
+    <mkdir dir="${deploy.dir}"/>
+    <copy file="plugin.xml" todir="${deploy.dir}" 
+          preservelastmodified="true"/>
+    <available property="lib-available"
+                 file="${build.dir}/${name}.jar"/>
+    <antcall target="copy-generated-lib"/>
+    <copy todir="${deploy.dir}" flatten="true">
+      <fileset refid="lib.jars"/>
+    </copy>
+  </target>
+	
+  <target name="copy-generated-lib" if="lib-available">
+    <copy file="${build.dir}/${name}.jar" todir="${deploy.dir}" failonerror="false"/>
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- Compile test code                                                  --> 
+  <!-- ================================================================== -->
+  <target name="compile-test" depends="compile, deps-test-compile" if="test.available">
+    <javac 
+     encoding="${build.encoding}" 
+     srcdir="${src.test}"
+     includes="**/*.java"
+     destdir="${build.test}"
+     debug="${javac.debug}"
+     optimize="${javac.optimize}"
+     target="${javac.version}"
+     source="${javac.version}"
+     deprecation="${javac.deprecation}">
+      <classpath refid="test.classpath"/>
+    </javac>    
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- Run unit tests                                                     --> 
+  <!-- ================================================================== -->
+  <target name="test" depends="compile-test, deploy" if="test.available">
+    <echo message="Testing plugin: ${name}"/>
+
+    <junit printsummary="yes" haltonfailure="no" fork="yes"
+      errorProperty="tests.failed" failureProperty="tests.failed">
+      <sysproperty key="test.data" value="${build.test}/data"/>
+      <sysproperty key="test.input" value="${root}/data"/>
+      <sysproperty key="javax.xml.parsers.DocumentBuilderFactory" value="com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"/> 
+      <classpath refid="test.classpath"/>
+      <formatter type="${test.junit.output.format}" />
+      <batchtest todir="${build.test}" unless="testcase">
+        <fileset dir="${src.test}"
+                 includes="**/Test*.java" excludes="**/${test.exclude}.java" />
+      </batchtest>
+      <batchtest todir="${build.test}" if="testcase">
+        <fileset dir="${src.test}" includes="**/${testcase}.java"/>
+      </batchtest>
+    </junit>
+
+    <fail if="tests.failed">Tests failed!</fail>
+
+  </target>   
+
+  <!-- target: resolve  ================================================= -->
+  <target name="resolve-default" depends="clean-lib" description="resolve and retrieve dependencies with ivy">
+    <ivy:resolve file="ivy.xml" conf="default" log="download-only"/>
+    <ivy:retrieve pattern="${deploy.dir}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/>
+  </target>
+
+  <target name="resolve-test" depends="clean-lib" description="resolve and retrieve dependencies with ivy">
+    <ivy:resolve file="ivy.xml" conf="test" log="download-only"/>
+    <ivy:retrieve pattern="${build.test.lib}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/>
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- Clean.  Delete the build files, and their directories              -->
+  <!-- ================================================================== -->
+  <!-- target: clean  =================================================== -->
+  <target name="clean" depends="clean-build, clean-lib" description="--> clean the project" />
+
+  <!-- target: clean-lib  =============================================== -->
+  <target name="clean-lib" description="--> clean the project libraries directory (dependencies)">
+    <delete includeemptydirs="true" dir="${build.lib.dir}"/>
+  </target>
+
+  <!-- target: clean-build  ============================================= -->
+  <target name="clean-build" description="--> clean the project built files">
+    <delete includeemptydirs="true" dir="${build.dir}"/>
+    <delete includeemptydirs="true" dir="${deploy.dir}"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/build.xml b/nutch-plugins/build.xml
new file mode 100755
index 0000000..75ae2e7
--- /dev/null
+++ b/nutch-plugins/build.xml
@@ -0,0 +1,213 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="Nutch" default="deploy-core" basedir=".">
+
+  <target name="deploy-core">
+    <ant target="compile-core" inheritall="false" dir="../.."/>
+    <ant target="deploy"/>
+  </target>
+
+  <!-- ====================================================== -->
+  <!-- Build & deploy all the plugin jars.                    -->
+  <!-- ====================================================== -->
+  <target name="deploy">
+     <ant dir="creativecommons" target="deploy"/>
+     <ant dir="feed" target="deploy"/>
+     <ant dir="headings" target="deploy"/>
+     <ant dir="index-basic" target="deploy"/>
+     <ant dir="index-anchor" target="deploy"/>
+     <ant dir="index-geoip" target="deploy"/>
+     <ant dir="index-more" target="deploy"/>
+     <ant dir="index-replace" target="deploy"/>
+     <ant dir="index-static" target="deploy"/>
+     <ant dir="index-metadata" target="deploy"/>
+     <ant dir="index-links" target="deploy"/>
+     <ant dir="mimetype-filter" target="deploy"/>
+     <ant dir="indexer-cloudsearch" target="deploy"/>
+     <ant dir="indexer-dummy" target="deploy"/>
+     <ant dir="indexer-elastic" target="deploy"/>
+     <ant dir="indexer-solr" target="deploy"/>
+     <ant dir="language-identifier" target="deploy"/>
+     <ant dir="lib-http" target="deploy"/>
+     <ant dir="lib-nekohtml" target="deploy"/>
+     <ant dir="lib-regex-filter" target="deploy"/>
+     <ant dir="lib-xml" target="deploy"/>
+     <ant dir="microformats-reltag" target="deploy"/>
+     <ant dir="nutch-extensionpoints" target="deploy"/>
+     <ant dir="protocol-file" target="deploy"/>
+     <ant dir="protocol-ftp" target="deploy"/>
+     <ant dir="protocol-http" target="deploy"/>
+     <ant dir="protocol-httpclient" target="deploy"/>
+     <ant dir="lib-htmlunit" target="deploy"/>
+     <ant dir="protocol-htmlunit" target="deploy" />
+     <ant dir="lib-selenium" target="deploy"/>
+     <ant dir="protocol-selenium" target="deploy" />
+     <ant dir="protocol-interactiveselenium" target="deploy" />
+     <ant dir="parse-ext" target="deploy"/>
+     <ant dir="parse-js" target="deploy"/>
+     <ant dir="parse-html" target="deploy"/>
+     <ant dir="parse-metatags" target="deploy"/>
+     <ant dir="parse-swf" target="deploy"/>
+     <ant dir="parse-tika" target="deploy"/>
+     <ant dir="parse-zip" target="deploy"/>
+     <ant dir="scoring-depth" target="deploy"/>
+     <ant dir="scoring-opic" target="deploy"/>
+     <ant dir="scoring-link" target="deploy"/>
+     <ant dir="scoring-similarity" target="deploy"/>
+     <ant dir="subcollection" target="deploy"/>
+     <ant dir="tld" target="deploy"/>
+     <ant dir="urlfilter-automaton" target="deploy"/>
+     <ant dir="urlfilter-domain" target="deploy" />
+     <ant dir="urlfilter-domainblacklist" target="deploy" />
+     <ant dir="urlfilter-prefix" target="deploy"/>
+     <ant dir="urlfilter-regex" target="deploy"/>
+     <ant dir="urlfilter-suffix" target="deploy"/>
+     <ant dir="urlfilter-validator" target="deploy"/>
+     <ant dir="urlfilter-ignoreexempt" target="deploy"/>
+     <ant dir="parsefilter-naivebayes" target="deploy"/>
+     <ant dir="parsefilter-regex" target="deploy"/>
+     <ant dir="urlmeta" target="deploy"/>
+     <ant dir="urlnormalizer-ajax" target="deploy"/>
+     <ant dir="urlnormalizer-basic" target="deploy"/>
+     <ant dir="urlnormalizer-host" target="deploy"/>
+     <ant dir="urlnormalizer-pass" target="deploy"/>
+     <ant dir="urlnormalizer-protocol" target="deploy"/>
+     <ant dir="urlnormalizer-querystring" target="deploy"/>
+     <ant dir="urlnormalizer-regex" target="deploy"/>
+     <ant dir="urlnormalizer-slash" target="deploy"/>
+  </target>
+
+  <!-- ====================================================== -->
+  <!-- Test all of the plugins.                               -->
+  <!-- ====================================================== -->
+  <target name="test">
+    <parallel threadCount="2">
+     <ant dir="creativecommons" target="test"/>
+     <ant dir="index-basic" target="test"/>
+     <ant dir="index-anchor" target="test"/>
+     <ant dir="index-geoip" target="test"/>
+     <ant dir="index-more" target="test"/>
+     <ant dir="index-static" target="test"/>
+     <ant dir="index-replace" target="test"/>
+     <ant dir="index-links" target="test"/>
+     <ant dir="mimetype-filter" target="test"/>
+     <ant dir="language-identifier" target="test"/>
+     <ant dir="lib-http" target="test"/>
+     <ant dir="protocol-file" target="test"/>
+     <ant dir="protocol-http" target="test"/>
+     <ant dir="protocol-httpclient" target="test"/>
+     <!--ant dir="parse-ext" target="test"/-->
+     <ant dir="feed" target="test"/>
+     <ant dir="parse-html" target="test"/>
+     <ant dir="parse-metatags" target="test"/>
+     <ant dir="parse-swf" target="test"/>
+     <ant dir="parse-tika" target="test"/>
+     <ant dir="parse-zip" target="test"/>
+     <ant dir="parsefilter-regex" target="test"/>
+     <ant dir="subcollection" target="test"/>
+     <ant dir="urlfilter-automaton" target="test"/>
+     <ant dir="urlfilter-domain" target="test"/>
+     <ant dir="urlfilter-domainblacklist" target="test"/>
+     <ant dir="urlfilter-prefix" target="test"/>
+     <ant dir="urlfilter-regex" target="test"/>
+     <ant dir="urlfilter-suffix" target="test"/>
+     <ant dir="urlfilter-validator" target="test"/>
+     <ant dir="urlfilter-ignoreexempt" target="test"/>
+     <ant dir="urlnormalizer-ajax" target="test"/>
+     <ant dir="urlnormalizer-basic" target="test"/>
+     <ant dir="urlnormalizer-host" target="test"/>
+     <ant dir="urlnormalizer-pass" target="test"/>
+     <ant dir="urlnormalizer-protocol" target="test"/>
+     <ant dir="urlnormalizer-querystring" target="test"/>
+     <ant dir="urlnormalizer-regex" target="test"/>
+     <ant dir="urlnormalizer-slash" target="test"/>
+    </parallel>
+  </target>
+
+  <!-- ====================================================== -->
+  <!-- Clean all of the plugins.                              -->
+  <!-- ====================================================== -->
+  <target name="clean">
+    <ant dir="creativecommons" target="clean"/>
+    <ant dir="feed" target="clean"/>
+    <ant dir="headings" target="clean"/>
+    <ant dir="index-basic" target="clean"/>
+    <ant dir="index-anchor" target="clean"/>
+    <ant dir="index-geoip" target="clean"/>
+    <ant dir="index-more" target="clean"/>
+    <ant dir="index-static" target="clean"/>
+    <ant dir="index-replace" target="clean"/>
+    <ant dir="index-metadata" target="clean"/>
+    <ant dir="index-links" target="clean"/>
+    <ant dir="mimetype-filter" target="clean"/>
+    <ant dir="indexer-cloudsearch" target="clean"/>
+    <ant dir="indexer-dummy" target="clean"/>
+    <ant dir="indexer-elastic" target="clean"/>
+    <ant dir="indexer-solr" target="clean"/>
+    <ant dir="language-identifier" target="clean"/>
+    <!-- <ant dir="lib-commons-httpclient" target="clean"/> -->
+    <ant dir="lib-http" target="clean"/>
+    <!-- <ant dir="lib-lucene-analyzers" target="clean"/>-->
+    <ant dir="lib-nekohtml" target="clean"/>
+    <ant dir="lib-regex-filter" target="clean"/>
+    <ant dir="lib-xml" target="clean"/>
+    <ant dir="microformats-reltag" target="clean"/>
+    <ant dir="nutch-extensionpoints" target="clean"/>
+    <ant dir="protocol-file" target="clean"/>
+    <ant dir="protocol-ftp" target="clean"/>
+    <ant dir="protocol-http" target="clean"/>
+    <ant dir="protocol-httpclient" target="clean"/>
+    <ant dir="lib-htmlunit" target="clean"/>
+    <ant dir="protocol-htmlunit" target="clean" />
+    <ant dir="lib-selenium" target="clean"/>
+    <ant dir="protocol-selenium" target="clean" />
+    <ant dir="protocol-interactiveselenium" target="clean" />
+    <ant dir="parse-ext" target="clean"/>
+    <ant dir="parse-js" target="clean"/>
+    <ant dir="parse-html" target="clean"/>
+    <ant dir="parse-metatags" target="clean"/>
+    <ant dir="parse-swf" target="clean"/>
+    <ant dir="parse-tika" target="clean"/>
+    <ant dir="parse-zip" target="clean"/>
+    <ant dir="parsefilter-regex" target="clean"/>
+    <ant dir="scoring-depth" target="clean"/>
+    <ant dir="scoring-opic" target="clean"/>
+    <ant dir="scoring-link" target="clean"/>
+    <ant dir="scoring-similarity" target="clean"/>
+    <ant dir="subcollection" target="clean"/>
+    <ant dir="tld" target="clean"/>
+    <ant dir="urlfilter-automaton" target="clean"/>
+    <ant dir="urlfilter-domain" target="clean" />
+    <ant dir="urlfilter-domainblacklist" target="clean" />
+    <ant dir="urlfilter-prefix" target="clean"/>
+    <ant dir="urlfilter-regex" target="clean"/>
+    <ant dir="urlfilter-suffix" target="clean"/>
+    <ant dir="urlfilter-validator" target="clean"/>
+    <ant dir="urlfilter-ignoreexempt" target="clean"/>
+    <ant dir="parsefilter-naivebayes" target="clean" />
+    <ant dir="urlmeta" target="clean"/>
+    <ant dir="urlnormalizer-ajax" target="clean"/>
+    <ant dir="urlnormalizer-basic" target="clean"/>
+    <ant dir="urlnormalizer-host" target="clean"/>
+    <ant dir="urlnormalizer-pass" target="clean"/>
+    <ant dir="urlnormalizer-protocol" target="clean"/>
+    <ant dir="urlnormalizer-querystring" target="clean"/>
+    <ant dir="urlnormalizer-regex" target="clean"/>
+    <ant dir="urlnormalizer-slash" target="clean"/>
+  </target>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/README.txt b/nutch-plugins/creativecommons/README.txt
new file mode 100644
index 0000000..d4d7b65
--- /dev/null
+++ b/nutch-plugins/creativecommons/README.txt
@@ -0,0 +1 @@
+Support for crawling and searching Creative-Commons licensed content. 

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/build.xml b/nutch-plugins/creativecommons/build.xml
new file mode 100755
index 0000000..6443d7f
--- /dev/null
+++ b/nutch-plugins/creativecommons/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="creativecommons" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+   <!--  <ant target="deploy" inheritall="false" dir="../parse-html"/> -->
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt b/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt
new file mode 100644
index 0000000..324617f
--- /dev/null
+++ b/nutch-plugins/creativecommons/conf/crawl-urlfilter.txt
@@ -0,0 +1,18 @@
+# Creative Commons crawl filter
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto|https):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|rtf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|mp3|rss|xml|doc|pdf|txt|DOC|PDF|TXT)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# accept anything else
++.

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/conf/nutch-site.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/conf/nutch-site.xml b/nutch-plugins/creativecommons/conf/nutch-site.xml
new file mode 100644
index 0000000..71e344b
--- /dev/null
+++ b/nutch-plugins/creativecommons/conf/nutch-site.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+
+<!-- Creative Commons' Nutch configuration -->
+
+<nutch-conf>
+
+<property>
+  <name>http.agent.name</name>
+  <value>CreativeCommons</value>
+  <description>Our HTTP 'User-Agent' request header.</description>
+</property>
+
+<property>
+  <name>http.robots.agents</name>
+  <value>CreativeCommons,Nutch,*</value>
+  <description>The agent strings we'll look for in robots.txt files,
+  comma-separated, in decreasing order of precedence.</description>
+</property>
+
+<property>
+  <name>fetcher.server.delay</name>
+  <value>2.0</value>
+  <description>We need to be more polite than when crawling an
+  intranet that we control.</description>
+</property>
+
+<property>
+  <name>http.max.delays</name>
+  <value>3</value>
+  <description>The CC crawl visits a large number of different
+  hosts, so we should not need to delay much.</description>
+</property>
+
+<property>
+  <name>creativecommons.exclude.unlicensed</name>
+  <value>true</value>
+  <description>Exclude HTML content which does not contain a CC license.
+  </description>
+</property>
+
+<property>
+  <name>plugin.excludes</name>
+  <value>parse-(?!html).*</value>
+  <description>Exclude non-HTML content, since we don't know how to
+  find a CC license in anything but HTML. 
+  </description>
+</property>
+
+</nutch-conf>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/ivy.xml b/nutch-plugins/creativecommons/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/creativecommons/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>


[13/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java
new file mode 100644
index 0000000..15725ae
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -0,0 +1,347 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.cyberneko.html.parsers.*;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils {
+
+  private static final String[] testPages = {
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors (outside of them, instead)! So you get a tree that
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+      // test frameset link extraction. The invalid frame in the middle will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+          + "</frameset>" + "</frameset>" + "</body></html>"),
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\"><!--no anchor--></a>"
+          + "<a href=\"g1\"> <!--whitespace-->  </a>"
+          + "<a href=\"g2\">  <img src=test.gif alt='bla bla'> </a>"
+          + "</body></html>"), };
+
+  private static int SKIP = 9;
+
+  private static String[] testBaseHrefs = { "http://www.nutch.org",
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something", "http://www.nutch.org/" };
+
+  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+  private static final String[] answerText = {
+      "title body anchor",
+      "title body home bots",
+      "separate this from this",
+      "my title body home 1 2",
+      "my title",
+      "my title the bottom",
+      "my title Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break . "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "title anchor1 anchor2 anchor3",
+      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };
+
+  private static final String[] answerTitle = { "title", "title", "",
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title", "title" };
+
+  // note: should be in page-order
+  private static Outlink[][] answerOutlinks;
+
+  private static Configuration conf;
+  private static DOMContentUtils utils = null;
+
+  @Before
+  public void setup() {
+    conf = NutchConfiguration.create();
+    conf.setBoolean("parser.html.form.use_action", true);
+    utils = new DOMContentUtils(conf);
+    DOMFragmentParser parser = new DOMFragmentParser();
+    try {
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+              true);
+    } catch (SAXException e) {
+    }
+    for (int i = 0; i < testPages.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+      try {
+        parser.parse(
+            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+            node);
+        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+      } catch (Exception e) {
+        Assert.assertTrue("caught exception: " + e, false);
+      }
+      testDOMs[i] = node;
+    }
+    try {
+      answerOutlinks = new Outlink[][] {
+          { new Outlink("http://www.nutch.org", "anchor"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+          { new Outlink("http://www.nutch.org/", "separate this"),
+              new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/1", "1"),
+              new Outlink("http://www.nutch.org/docs/2", "2"), },
+          { new Outlink("http://www.nutch.org/frames/top.html", ""),
+              new Outlink("http://www.nutch.org/frames/left.html", ""),
+              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+              new Outlink("http://www.nutch.org/frames/right.html", ""), },
+          { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+              new Outlink("http://www.nutch.org/index.html", ""),
+              new Outlink("http://www.nutch.org/maps/#bottom", ""),
+              new Outlink("http://www.nutch.org/bot.html", ""),
+              new Outlink("http://www.nutch.org/docs/index.html", ""), },
+          { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+          {},
+          { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+          {},
+          { new Outlink("http://www.nutch.org/;x", "anchor1"),
+              new Outlink("http://www.nutch.org/g;x", "anchor2"),
+              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+          {
+              // this is tricky - see RFC3986 section 5.4.1 example 7
+              new Outlink("http://www.nutch.org/g", "anchor1"),
+              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+              new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                  "anchor5") },
+          { new Outlink("http://www.nutch.org/g", ""),
+              new Outlink("http://www.nutch.org/g1", ""),
+              new Outlink("http://www.nutch.org/g2", "bla bla"),
+              new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
+
+    } catch (MalformedURLException e) {
+
+    }
+  }
+
+  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+    StringTokenizer st1 = new StringTokenizer(s1);
+    StringTokenizer st2 = new StringTokenizer(s2);
+
+    while (st1.hasMoreTokens()) {
+      if (!st2.hasMoreTokens())
+        return false;
+      if (!st1.nextToken().equals(st2.nextToken()))
+        return false;
+    }
+    if (st2.hasMoreTokens())
+      return false;
+    return true;
+  }
+
+  @Test
+  public void testGetText() {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getText(sb, testDOMs[i]);
+      String text = sb.toString();
+      Assert.assertTrue(
+          "expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
+          equalsIgnoreWhitespace(answerText[i], text));
+    }
+  }
+
+  @Test
+  public void testGetTitle() {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getTitle(sb, testDOMs[i]);
+      String text = sb.toString();
+      Assert.assertTrue(
+          "expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
+          equalsIgnoreWhitespace(answerTitle[i], text));
+    }
+  }
+
+  @Test
+  public void testGetOutlinks() {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
+      if (i == SKIP) {
+        conf.setBoolean("parser.html.form.use_action", false);
+        utils.setConf(conf);
+      } else {
+        conf.setBoolean("parser.html.form.use_action", true);
+        utils.setConf(conf);
+      }
+      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
+      Outlink[] outlinkArr = new Outlink[outlinks.size()];
+      outlinkArr = (Outlink[]) outlinks.toArray(outlinkArr);
+      compareOutlinks(answerOutlinks[i], outlinkArr);
+    }
+  }
+
+  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+    for (int i = 0; i < o.length; i++) {
+      sb.append(o[i].toString());
+      sb.append(System.getProperty("line.separator"));
+    }
+  }
+
+  private static final String outlinksString(Outlink[] o) {
+    StringBuffer sb = new StringBuffer();
+    appendOutlinks(sb, o);
+    return sb.toString();
+  }
+
+  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+    if (o1.length != o2.length) {
+      Assert.assertTrue(
+          "got wrong number of outlinks (expecting " + o1.length + ", got "
+              + o2.length + ")" + System.getProperty("line.separator")
+              + "answer: " + System.getProperty("line.separator")
+              + outlinksString(o1) + System.getProperty("line.separator")
+              + "got: " + System.getProperty("line.separator")
+              + outlinksString(o2) + System.getProperty("line.separator"),
+          false);
+    }
+
+    for (int i = 0; i < o1.length; i++) {
+      if (!o1[i].equals(o2[i])) {
+        Assert.assertTrue(
+            "got wrong outlinks at position " + i
+                + System.getProperty("line.separator") + "answer: "
+                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+                + "', anchor: '" + o1[i].getAnchor() + "'"
+                + System.getProperty("line.separator") + "got: "
+                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+                + "', anchor: '" + o2[i].getAnchor() + "'", false);
+
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java
new file mode 100644
index 0000000..7099f50
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -0,0 +1,122 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.html.HtmlParser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestHtmlParser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TestHtmlParser.class);
+
+  private static final String encodingTestKeywords = "français, español, \u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a, \u010de\u0161tina, \u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac";
+  private static final String encodingTestBody = "<ul>\n  <li>français\n  <li>español\n  <li>\u0440\u0443\u0441\u0441\u043a\u0438\u0439 \u044f\u0437\u044b\u043a\n  <li>\u010de\u0161tina\n  <li>\u03b5\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ac\n</ul>";
+  private static final String encodingTestContent = "<title>"
+      + encodingTestKeywords + "</title>\n"
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  private static String[][] encodingTestPages = {
+      {
+          "HTML4, utf-8, meta http-equiv, no quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML4, utf-8, meta http-equiv, single quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+              + encodingTestContent },
+      {
+          "XHTML, utf-8, meta http-equiv, double quotes",
+          "utf-8",
+          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML5, utf-8, meta charset",
+          "utf-8",
+          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
+              + encodingTestContent },
+      { "HTML5, utf-8, BOM", "utf-8",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
+      { "HTML5, utf-16, BOM", "utf-16",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
+
+  private Configuration conf;
+  private Parser parser;
+
+  public TestHtmlParser() {
+    conf = NutchConfiguration.create();
+    parser = new HtmlParser();
+    parser.setConf(conf);
+  }
+
+  protected Parse parse(byte[] contentBytes) {
+    String dummyUrl = "http://dummy.url/";
+    return parser.getParse(
+        new Content(dummyUrl, dummyUrl, contentBytes, "text/html",
+            new Metadata(), conf)).get(dummyUrl);
+  }
+
+  @Test
+  public void testEncodingDetection() {
+    for (String[] testPage : encodingTestPages) {
+      String name = testPage[0];
+      Charset charset = Charset.forName(testPage[1]);
+      byte[] contentBytes = testPage[2].getBytes(charset);
+      Parse parse = parse(contentBytes);
+      String text = parse.getText();
+      String title = parse.getData().getTitle();
+      String keywords = parse.getData().getMeta("keywords");
+      LOG.info(name);
+      LOG.info("title:\t" + title);
+      LOG.info("keywords:\t" + keywords);
+      LOG.info("text:\t" + text);
+      Assert.assertEquals("Title not extracted properly (" + name + ")",
+          encodingTestKeywords, title);
+      for (String keyword : encodingTestKeywords.split(",\\s*")) {
+        Assert.assertTrue(keyword + " not found in text (" + name + ")",
+            text.contains(keyword));
+      }
+      Assert.assertNotNull("No keywords extracted", keywords);
+      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
+          encodingTestKeywords, keywords);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
new file mode 100644
index 0000000..5089a10
--- /dev/null
+++ b/nutch-plugins/parse-html/src/test/java/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.cyberneko.html.parsers.*;
+import org.junit.Assert;
+import org.junit.Test;
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor {
+
+  /*
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
+
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
+  };
+
+  private URL[][] currURLsAndAnswers;
+
+  @Test
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser = new DOMFragmentParser();
+    ;
+
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
+    } catch (Exception e) {
+      Assert.assertTrue("couldn't make test URLs!", false);
+    }
+
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
+
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+      try {
+        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+      } catch (Exception e) {
+        e.printStackTrace();
+      }
+
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
+
+      Assert.assertTrue("got index wrong on test " + i,
+          robotsMeta.getNoIndex() == answers[i][0]);
+      Assert.assertTrue("got follow wrong on test " + i,
+          robotsMeta.getNoFollow() == answers[i][1]);
+      Assert.assertTrue("got cache wrong on test " + i,
+          robotsMeta.getNoCache() == answers[i][2]);
+      Assert
+          .assertTrue(
+              "got base href wrong on test " + i + " (got "
+                  + robotsMeta.getBaseHref() + ")",
+              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
+                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
+
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-js/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/build.xml b/nutch-plugins/parse-js/build.xml
new file mode 100644
index 0000000..d9c2146
--- /dev/null
+++ b/nutch-plugins/parse-js/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-js" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-js/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/ivy.xml b/nutch-plugins/parse-js/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-js/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-js/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/plugin.xml b/nutch-plugins/parse-js/plugin.xml
new file mode 100644
index 0000000..9c06c2a
--- /dev/null
+++ b/nutch-plugins/parse-js/plugin.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-js"
+   name="JavaScript Parser"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-js.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.js"
+              name="JS Parser"
+              point="org.apache.nutch.parse.Parser">
+      <implementation id="JSParser"
+         class="org.apache.nutch.parse.js.JSParseFilter">
+        <parameter name="contentType" value="application/x-javascript"/>
+        <parameter name="pathSuffix"  value="js"/>
+      </implementation>
+   </extension>
+   <extension id="org.apache.nutch.parse.js.JSParseFilter"
+              name="Parse JS Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="JSParseFilter"
+         class="org.apache.nutch.parse.js.JSParseFilter">
+        <parameter name="contentType" value="application/x-javascript"/>
+        <parameter name="pathSuffix"  value=""/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-js/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/pom.xml b/nutch-plugins/parse-js/pom.xml
new file mode 100644
index 0000000..68d5770
--- /dev/null
+++ b/nutch-plugins/parse-js/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-js</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-js</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java
new file mode 100644
index 0000000..8c95372
--- /dev/null
+++ b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/JSParseFilter.java
@@ -0,0 +1,301 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.js;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * This class is a heuristic link extractor for JavaScript files and code
+ * snippets. The general idea of a two-pass regex matching comes from Heritrix.
+ * Parts of the code come from OutlinkExtractor.java
+ */
+public class JSParseFilter implements HtmlParseFilter, Parser {
+  public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);
+
+  private static final int MAX_TITLE_LEN = 80;
+
+  private Configuration conf;
+
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+
+    String url = content.getBaseUrl();
+    ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
+    walk(doc, parse, metaTags, url, outlinks);
+    if (outlinks.size() > 0) {
+      Outlink[] old = parse.getData().getOutlinks();
+      String title = parse.getData().getTitle();
+      List<Outlink> list = Arrays.asList(old);
+      outlinks.addAll(list);
+      ParseStatus status = parse.getData().getStatus();
+      String text = parse.getText();
+      Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks
+          .size()]);
+      ParseData parseData = new ParseData(status, title, newlinks, parse
+          .getData().getContentMeta(), parse.getData().getParseMeta());
+
+      // replace original parse obj with new one
+      parseResult.put(content.getUrl(), new ParseText(text), parseData);
+    }
+    return parseResult;
+  }
+
+  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
+      List<Outlink> outlinks) {
+    if (n instanceof Element) {
+      String name = n.getNodeName();
+      if (name.equalsIgnoreCase("script")) {
+        /*
+         * String lang = null; Node lNode =
+         * n.getAttributes().getNamedItem("language"); if (lNode == null) lang =
+         * "javascript"; else lang = lNode.getNodeValue();
+         */
+        StringBuffer script = new StringBuffer();
+        NodeList nn = n.getChildNodes();
+        if (nn.getLength() > 0) {
+          for (int i = 0; i < nn.getLength(); i++) {
+            if (i > 0)
+              script.append('\n');
+            script.append(nn.item(i).getNodeValue());
+          }
+          // if (LOG.isInfoEnabled()) {
+          // LOG.info("script: language=" + lang + ", text: " +
+          // script.toString());
+          // }
+          Outlink[] links = getJSLinks(script.toString(), "", base);
+          if (links != null && links.length > 0)
+            outlinks.addAll(Arrays.asList(links));
+          // no other children of interest here, go one level up.
+          return;
+        }
+      } else {
+        // process all HTML 4.0 events, if present...
+        NamedNodeMap attrs = n.getAttributes();
+        int len = attrs.getLength();
+        for (int i = 0; i < len; i++) {
+          // Window: onload,onunload
+          // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
+          // Keyboard: onkeydown,onkeypress,onkeyup
+          // Mouse:
+          // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
+          Node anode = attrs.item(i);
+          Outlink[] links = null;
+          if (anode.getNodeName().startsWith("on")) {
+            links = getJSLinks(anode.getNodeValue(), "", base);
+          } else if (anode.getNodeName().equalsIgnoreCase("href")) {
+            String val = anode.getNodeValue();
+            if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
+              links = getJSLinks(val, "", base);
+            }
+          }
+          if (links != null && links.length > 0)
+            outlinks.addAll(Arrays.asList(links));
+        }
+      }
+    }
+    NodeList nl = n.getChildNodes();
+    for (int i = 0; i < nl.getLength(); i++) {
+      walk(nl.item(i), parse, metaTags, base, outlinks);
+    }
+  }
+
+  public ParseResult getParse(Content c) {
+    String type = c.getContentType();
+    if (type != null && !type.trim().equals("")
+        && !type.toLowerCase().startsWith("application/x-javascript"))
+      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
+          "Content not JavaScript: '" + type + "'").getEmptyParseResult(
+          c.getUrl(), getConf());
+    String script = new String(c.getContent());
+    Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
+    if (outlinks == null)
+      outlinks = new Outlink[0];
+    // Title? use the first line of the script...
+    String title;
+    int idx = script.indexOf('\n');
+    if (idx != -1) {
+      if (idx > MAX_TITLE_LEN)
+        idx = MAX_TITLE_LEN;
+      title = script.substring(0, idx);
+    } else {
+      idx = Math.min(MAX_TITLE_LEN, script.length());
+      title = script.substring(0, idx);
+    }
+    ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
+        c.getMetadata());
+    return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
+  }
+
+  private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
+  // A simple pattern. This allows also invalid URL characters.
+  private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
+
+  // Alternative pattern, which limits valid url characters.
+  // private static final String URI_PATTERN =
+  // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
+
+  /**
+   * This method extracts URLs from literals embedded in JavaScript.
+   */
+  private Outlink[] getJSLinks(String plainText, String anchor, String base) {
+
+    final List<Outlink> outlinks = new ArrayList<Outlink>();
+    URL baseURL = null;
+
+    try {
+      baseURL = new URL(base);
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getJSLinks", e);
+      }
+    }
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(STRING_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final Pattern pattern1 = cp.compile(URI_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+
+      final PatternMatcher matcher1 = new Perl5Matcher();
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+      MatchResult result;
+      String url;
+
+      // loop the matches
+      while (matcher.contains(input, pattern)) {
+        result = matcher.getMatch();
+        url = result.group(2);
+        PatternMatcherInput input1 = new PatternMatcherInput(url);
+        if (!matcher1.matches(input1, pattern1)) {
+          // if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'");
+          // }
+          continue;
+        }
+        if (url.startsWith("www.")) {
+          url = "http://" + url;
+        } else {
+          // See if candidate URL is parseable. If not, pass and move on to
+          // the next match.
+          try {
+            url = new URL(baseURL, url).toString();
+          } catch (MalformedURLException ex) {
+            if (LOG.isTraceEnabled()) {
+              LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
+                  + baseURL + "'", ex);
+            }
+            continue;
+          }
+        }
+        url = url.replaceAll("&amp;", "&");
+        if (LOG.isTraceEnabled()) {
+          LOG.trace(" - outlink from JS: '" + url + "'");
+        }
+        outlinks.add(new Outlink(url, anchor));
+      }
+    } catch (Exception ex) {
+      // if it is a malformed URL we just throw it away and continue with
+      // extraction.
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getJSLinks", ex);
+      }
+    }
+
+    final Outlink[] retval;
+
+    // create array of the Outlinks
+    if (outlinks != null && outlinks.size() > 0) {
+      retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    } else {
+      retval = new Outlink[0];
+    }
+
+    return retval;
+  }
+
+  public static void main(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
+      return;
+    }
+    InputStream in = new FileInputStream(args[0]);
+    BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+    StringBuffer sb = new StringBuffer();
+    String line = null;
+    while ((line = br.readLine()) != null)
+      sb.append(line + "\n");
+    br.close();
+
+    JSParseFilter parseFilter = new JSParseFilter();
+    parseFilter.setConf(NutchConfiguration.create());
+    Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
+    System.out.println("Outlinks extracted: " + links.length);
+    for (int i = 0; i < links.length; i++)
+      System.out.println(" - " + links[i]);
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java
new file mode 100644
index 0000000..36d0d14
--- /dev/null
+++ b/nutch-plugins/parse-js/src/main/java/org/apache/nutch/parse/js/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parser and parse filter plugin to extract all (possible) links
+ * from JavaScript files and embedded JavaScript code snippets.
+ */
+package org.apache.nutch.parse.js;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/README.txt b/nutch-plugins/parse-metatags/README.txt
new file mode 100644
index 0000000..0d5b009
--- /dev/null
+++ b/nutch-plugins/parse-metatags/README.txt
@@ -0,0 +1,17 @@
+Parse-metatags plugin
+
+The parse-metatags plugin consists of an HtmlParseFilter which takes as parameter a list of metatag names with '*' as default value. The values are comma-separated (they are read via Configuration.getStrings).
+In order to extract the values of the metatags description and keywords, you must specify in nutch-site.xml
+
+<property>
+  <name>metatags.names</name>
+  <value>description,keywords</value>
+</property>
+
+Prefixes the names with 'metatag.' in the parse-metadata. For instance to index description and keywords, you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
+  
+This code has been developed by DigitalPebble Ltd and offered to the community by ANT.com
+
+
+
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/build.xml b/nutch-plugins/parse-metatags/build.xml
new file mode 100644
index 0000000..e30292d
--- /dev/null
+++ b/nutch-plugins/parse-metatags/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-metatags" default="jar-core">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- Deploy Unit test dependencies -->
+	<target name="deps-test">
+		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+		<ant target="deploy" inheritall="false" dir="../protocol-file" />
+	</target>
+
+
+	<!-- for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.html" />
+		</fileset>
+	</copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/ivy.xml b/nutch-plugins/parse-metatags/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-metatags/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/plugin.xml b/nutch-plugins/parse-metatags/plugin.xml
new file mode 100644
index 0000000..07933fa
--- /dev/null
+++ b/nutch-plugins/parse-metatags/plugin.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-metatags"
+   name="MetaTags"
+   version="1.0"
+   provider-name="digitalpebble.com">
+
+   <runtime>
+      <library name="parse-metatags.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.metatags.parser"
+              name="MetaTags Parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="MetaTagsParser"
+                      class="org.apache.nutch.parse.metatags.MetaTagsParser"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/pom.xml b/nutch-plugins/parse-metatags/pom.xml
new file mode 100644
index 0000000..e96d404
--- /dev/null
+++ b/nutch-plugins/parse-metatags/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-metatags</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-metatags</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
new file mode 100644
index 0000000..f9b9722
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.metatags;
+
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Parse HTML meta tags (keywords, description) and store them in the parse
+ * metadata so that they can be indexed with the index-metadata plugin with the
+ * prefix 'metatag.'. Metatags are matched ignoring case.
+ */
+public class MetaTagsParser implements HtmlParseFilter {
+
+  private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
+      .getName());
+
+  private Configuration conf;
+
+  private Set<String> metatagset = new HashSet<String>();
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    // specify whether we want a specific subset of metadata
+    // by default take everything we can find
+    String[] values = conf.getStrings("metatags.names", "*");
+    for (String val : values) {
+      metatagset.add(val.toLowerCase(Locale.ROOT));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Check whether the metatag is in the list of metatags to be indexed (or if
+   * '*' is specified). If yes, add it to parse metadata.
+   */
+  private void addIndexedMetatags(Metadata metadata, String metatag,
+      String value) {
+    String lcMetatag = metatag.toLowerCase(Locale.ROOT);
+    if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+      }
+      metadata.add("metatag." + lcMetatag, value);
+    }
+  }
+
+  /**
+   * Check whether the metatag is in the list of metatags to be indexed (or if
+   * '*' is specified). If yes, add it with all values to parse metadata.
+   */
+  private void addIndexedMetatags(Metadata metadata, String metatag,
+      String[] values) {
+    String lcMetatag = metatag.toLowerCase(Locale.ROOT);
+    if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+      for (String value : values) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+        }
+        metadata.add("metatag." + lcMetatag, value);
+      }
+    }
+  }
+
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+    Metadata metadata = parse.getData().getParseMeta();
+
+    // check in the metadata first : the tika-parser
+    // might have stored the values there already
+    for (String mdName : metadata.names()) {
+      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
+    }
+
+    Metadata generalMetaTags = metaTags.getGeneralTags();
+    for (String tagName : generalMetaTags.names()) {
+      addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName));
+    }
+
+    Properties httpequiv = metaTags.getHttpEquivTags();
+    for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
+        .hasMoreElements();) {
+      String name = (String) tagNames.nextElement();
+      String value = httpequiv.getProperty(name);
+      addIndexedMetatags(metadata, name, value);
+    }
+
+    return parseResult;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java
new file mode 100644
index 0000000..a55cf5c
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/main/java/org/apache/nutch/parse/metatags/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse filter to extract meta tags: keywords, description, etc.
+ * Used in combination with index-metadata plugin
+ * (see {@link org.apache.nutch.indexer.metadata}).
+ */
+package org.apache.nutch.parse.metatags;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java b/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java
new file mode 100644
index 0000000..024aadf
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/test/java/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.metatags;
+
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestMetatagParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testMetatags.html";
+  private String sampleFileMultival = "testMultivalueMetatags.html";
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      metadata = parse.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+    return metadata;
+  }
+
+  @Test
+  /** test defaults: keywords and description */
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+
+    // check that we get the same values
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+
+    Assert.assertEquals(description, parseMeta.get("metatag.description"));
+    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+
+  @Test
+  /** test multiple metatags resulting in metadata with multiple values */
+  public void testMultiValueMetatags() {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("metatags.names", "keywords,DC.creator");
+    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
+
+    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
+
+    String failMessage = "One value of metatag with multiple values is missing: ";
+
+    Set<String> valueSet = new TreeSet<String>();
+    for (String val : parseMeta.getValues("metatag.dc.creator")) {
+      valueSet.add(val);
+    }
+    String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" };
+    for (String val : expectedValues1) {
+      Assert.assertTrue(failMessage + val, valueSet.contains(val));
+    }
+
+    valueSet.clear();
+    for (String val : parseMeta.getValues("metatag.keywords")) {
+      valueSet.add(val);
+    }
+    String[] expectedValues2 = { "robot d'indexation", "web crawler",
+        "Webcrawler" };
+    for (String val : expectedValues2) {
+      Assert.assertTrue(failMessage + val, valueSet.contains(val));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/src/test/resources/testMetatags.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/test/resources/testMetatags.html b/nutch-plugins/parse-metatags/src/test/resources/testMetatags.html
new file mode 100644
index 0000000..e9e8e6b
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/test/resources/testMetatags.html
@@ -0,0 +1,9 @@
+<html>
+<head>
+<meta name="Keywords" content="This is a test of keywords" />
+<meta name="Description" content="This is a test of description" />
+</head>
+<body>
+text of the document
+</body>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-metatags/src/test/resources/testMultivalueMetatags.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-metatags/src/test/resources/testMultivalueMetatags.html b/nutch-plugins/parse-metatags/src/test/resources/testMultivalueMetatags.html
new file mode 100644
index 0000000..ca8b737
--- /dev/null
+++ b/nutch-plugins/parse-metatags/src/test/resources/testMultivalueMetatags.html
@@ -0,0 +1,12 @@
+<html>
+<head>
+<meta name="DC.creator" content="Doug Cutting">
+<meta name="DC.creator" content="Michael Cafarella">
+<!-- meta keywords in different casing -->
+<meta name="keywords" lang="en" content="web crawler" />
+<meta name="Keywords" lang="fr" content="robot d'indexation" />
+<meta name="KEYWORDS" lang="de" content="Webcrawler" />
+</head>
+<body>
+A test for multi-valued metatags.
+</body>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/README.txt b/nutch-plugins/parse-replace/README.txt
new file mode 100644
index 0000000..a18bd9c
--- /dev/null
+++ b/nutch-plugins/parse-replace/README.txt
@@ -0,0 +1,91 @@
+ParseReplace plugin
+
+Allows post-parsing regexp replace manipulation of metadata fields.
+
+Configuration Example
+    <property>
+      <name>parse.replace.regexp</name>
+      <value>
+        id=/file:/http:/
+        url=/file:/http:/128
+      </value>
+    </property>
+
+Property format: parse.replace.regexp
+    The format of the property is a list of regexp replacements, one line per field being
+    modified.  Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
+
+    The fieldname precedes the equal sign.  The first character after the equal sign signifies
+    the delimiter for the regexp, the replacement value and the flags.
+
+Replacement Sequence
+    The replacements will happen in the order listed. If a field needs multiple replacement operations
+    they may be listed more than once.
+
+RegExp Format
+    The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
+    here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
+    Patterns are compiled when the plugin is initialized for efficiency.
+
+Replacement Format
+    The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement):
+    http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
+
+Flags
+    The flags is an integer sum of the flag values defined in
+    http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
+
+Escaping
+    Since the regexp is being read from a config file, any escaped values must be double
+    escaped.  E.g.:  id=/\\s+//  will cause the escaped \s+ match pattern to be used.
+
+Multi-valued Fields
+    If a field has multiple values, the replacement will be applied to each value in turn.
+
+Non-string Datatypes
+    Replacement is possible only on String field datatypes.  If the field you name in the property is
+    not a String datatype, it will be silently ignored.
+
+Host and URL specific replacements.
+    If the replacements should apply only to specific pages, then add a sequence like
+
+    hostmatch=/host match pattern/
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+    or
+    urlmatch=/url match pattern/
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
+will apply to all parsed pages.  Replacements following a hostmatch or urlmatch will be applied
+to pages which match the host or url field (up to the next hostmatch or urlmatch line).  hostmatch
+and urlmatch patterns must be unique in this property.
+
+Plugin order
+    TBD... But in most cases you will want this plugin to run last.
+
+Testing your match patterns
+    Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
+    can help get the basics of your pattern working.
+    To test in nutch: 
+        Prepare a test HTML file with the field contents you want to test. 
+        Place this in a directory accessible to nutch.
+        Use the file:/// syntax to list the test file(s) in a test/urls seed list.
+        See the nutch faq "index my local file system" for conf settings you will need.
+        (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
+        test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
+        patterns also match the file: URL pattern)
+ 
+    Run..
+        bin/nutch inject crawl/crawldb test
+        bin/nutch generate crawl/crawldb crawl/segments
+        bin/nutch fetch crawl/segments/[segment]
+        bin/nutch parse crawl/segments/[segment]
+
+    To inspect the returned fields...
+        bin/nutch readseg -dump crawl/segments/[segment] testout
+        less testout/dump
+
+    To retry: delete crawl/segments/[segment]/crawl_parse and repeat the parse and dump step.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/build.xml b/nutch-plugins/parse-replace/build.xml
new file mode 100644
index 0000000..ca5ccf7
--- /dev/null
+++ b/nutch-plugins/parse-replace/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-replace" default="jar-core">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- Deploy Unit test dependencies -->
+	<target name="deps-test">
+		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+		<ant target="deploy" inheritall="false" dir="../protocol-file" />
+	</target>
+
+
+	<!-- for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.html" />
+		</fileset>
+	</copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/ivy.xml b/nutch-plugins/parse-replace/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-replace/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/plugin.xml b/nutch-plugins/parse-replace/plugin.xml
new file mode 100644
index 0000000..6368210
--- /dev/null
+++ b/nutch-plugins/parse-replace/plugin.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-replace"
+   name="ReplaceParser"
+   version="1.0"
+   provider-name="PeterCiuffetti">
+
+   <runtime>
+      <library name="parse-replace.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.replace.parser"
+              name="Replace Parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="ReplaceParser"
+                      class="org.apache.nutch.parse.replace.ReplaceParser"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/pom.xml b/nutch-plugins/parse-replace/pom.xml
new file mode 100644
index 0000000..073f895
--- /dev/null
+++ b/nutch-plugins/parse-replace/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-replace</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-replace</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java
new file mode 100644
index 0000000..9773c4a
--- /dev/null
+++ b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/ReplaceParser.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.replace;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Do pattern replacements on selected field contents
+ * prior to indexing.
+ */
+public class ReplaceParser implements HtmlParseFilter {
+
+  private static final Log LOG = LogFactory.getLog(ReplaceParser.class
+      .getName());
+
+  private static Map<String, List<Object>> REPLACEPATTERNS_BY_HOST = new HashMap();
+  private static Map<String, List<Object>> REPLACEPATTERNS_BY_URL = new HashMap();
+
+  private Configuration conf;
+
+  private Set<String> metatagset = new HashSet<String>();
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String[] values = conf.getStrings("parse.replace.regexp", null);
+    if (values != null) {
+      this.parseConf(values);
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  private void parseConf(String[] values) {
+	  
+  }
+
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+
+    return parseResult;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java
new file mode 100644
index 0000000..b678f00
--- /dev/null
+++ b/nutch-plugins/parse-replace/src/main/java/org/apache/nutch/parse/replace/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/**
 * Parse filter to allow pattern replacements on parsed metadata.
 * Provided by {@link org.apache.nutch.parse.replace.ReplaceParser},
 * which is registered as an
 * {@link org.apache.nutch.parse.HtmlParseFilter} extension. The
 * replacement rules are read from the {@code parse.replace.regexp}
 * configuration property.
 */
package org.apache.nutch.parse.replace;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java b/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java
new file mode 100644
index 0000000..593d5ed
--- /dev/null
+++ b/nutch-plugins/parse-replace/src/test/java/org/apache/nutch/parse/replace/TestParseReplace.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.replace;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestParseReplace {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testParseReplace.html";
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      metadata = parse.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+    return metadata;
+  }
+
+  @Test
+  /** test defaults: keywords and description */
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+
+    // check that we get the same values
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+
+    Assert.assertEquals(description, parseMeta.get("metatag.description"));
+    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-replace/src/test/resources/testParseReplace.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-replace/src/test/resources/testParseReplace.html b/nutch-plugins/parse-replace/src/test/resources/testParseReplace.html
new file mode 100644
index 0000000..825dcb9
--- /dev/null
+++ b/nutch-plugins/parse-replace/src/test/resources/testParseReplace.html
@@ -0,0 +1,11 @@
+<html>
+  <head>
+    <title>Testing the power of parser-replace plugin</title>
+    <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!">
+    <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!">
+    <meta name="author" content="Peter Ciuffetti">
+  </head>
+  <body>
+    <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. A decidely boring thing to do.</p>
+  </body>
+</html>
\ No newline at end of file


[45/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDb.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDb.java b/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDb.java
new file mode 100644
index 0000000..3ba3c81
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDb.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.hostdb;
+
+import java.text.SimpleDateFormat;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.KeyValueTextInputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.lib.MultipleInputs;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.LockUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tool to create a HostDB from the CrawlDB. It aggregates fetch status values
+ * by host and checks DNS entries for hosts.
+ */
+public class UpdateHostDb extends Configured implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(UpdateHostDb.class);
+  public static final String LOCK_NAME = ".locked";
+
+  public static final String HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD = "hostdb.purge.failed.hosts.threshold";
+  public static final String HOSTDB_NUM_RESOLVER_THREADS = "hostdb.num.resolvers.threads";
+  public static final String HOSTDB_RECHECK_INTERVAL = "hostdb.recheck.interval";
+  public static final String HOSTDB_CHECK_FAILED = "hostdb.check.failed";
+  public static final String HOSTDB_CHECK_NEW = "hostdb.check.new";
+  public static final String HOSTDB_CHECK_KNOWN = "hostdb.check.known";
+  public static final String HOSTDB_FORCE_CHECK = "hostdb.force.check";
+  public static final String HOSTDB_URL_FILTERING = "hostdb.url.filter";
+  public static final String HOSTDB_URL_NORMALIZING = "hostdb.url.normalize";
+  public static final String HOSTDB_NUMERIC_FIELDS = "hostdb.numeric.fields";
+  public static final String HOSTDB_STRING_FIELDS = "hostdb.string.fields";
+  public static final String HOSTDB_PERCENTILES = "hostdb.percentiles";
+  
+  private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts,
+    boolean checkFailed, boolean checkNew, boolean checkKnown,
+    boolean force, boolean filter, boolean normalize) throws Exception {
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("UpdateHostDb: starting at " + sdf.format(start));
+
+    JobConf job = new NutchJob(getConf());
+    boolean preserveBackup = job.getBoolean("db.preserve.backup", true);
+    job.setJarByClass(UpdateHostDb.class);
+    job.setJobName("UpdateHostDb");
+
+    // Check whether the urlfilter-domainblacklist plugin is loaded
+    if (filter && new String("urlfilter-domainblacklist").matches(job.get("plugin.includes"))) {
+      throw new Exception("domainblacklist-urlfilter must not be enabled");
+    }
+
+    // Check whether the urlnormalizer-host plugin is loaded
+    if (normalize && new String("urlnormalizer-host").matches(job.get("plugin.includes"))) {
+      throw new Exception("urlnormalizer-host must not be enabled");
+    }
+
+    FileSystem fs = FileSystem.get(job);
+    Path old = new Path(hostDb, "old");
+    Path current = new Path(hostDb, "current");
+    Path tempHostDb = new Path(hostDb, "hostdb-"
+      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    // lock an existing hostdb to prevent multiple simultaneous updates
+    Path lock = new Path(hostDb, LOCK_NAME);
+    if (!fs.exists(current)) {
+      fs.mkdirs(current);
+    }
+    LockUtil.createLockFile(fs, lock, false);
+
+    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
+
+    if (topHosts != null) {
+      MultipleInputs.addInputPath(job, topHosts, KeyValueTextInputFormat.class);
+    }
+    if (crawlDb != null) {
+      // Tell the job we read from CrawlDB
+      job.setBoolean("hostdb.reading.crawldb", true);
+      MultipleInputs.addInputPath(job, new Path(crawlDb,
+        CrawlDb.CURRENT_NAME), SequenceFileInputFormat.class);
+    }
+
+    FileOutputFormat.setOutputPath(job, tempHostDb);
+
+    job.setOutputFormat(SequenceFileOutputFormat.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(NutchWritable.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(HostDatum.class);
+    job.setMapperClass(UpdateHostDbMapper.class);
+    job.setReducerClass(UpdateHostDbReducer.class);
+
+    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+    job.setSpeculativeExecution(false);
+    job.setBoolean(HOSTDB_CHECK_FAILED, checkFailed);
+    job.setBoolean(HOSTDB_CHECK_NEW, checkNew);
+    job.setBoolean(HOSTDB_CHECK_KNOWN, checkKnown);
+    job.setBoolean(HOSTDB_FORCE_CHECK, force);
+    job.setBoolean(HOSTDB_URL_FILTERING, filter);
+    job.setBoolean(HOSTDB_URL_NORMALIZING, normalize);
+    job.setClassLoader(Thread.currentThread().getContextClassLoader());
+    
+    try {
+      JobClient.runJob(job);
+
+      FSUtils.replace(fs, old, current, true);
+      FSUtils.replace(fs, current, tempHostDb, true);
+
+      if (!preserveBackup && fs.exists(old)) fs.delete(old, true);
+    } catch (Exception e) {
+      if (fs.exists(tempHostDb)) {
+        fs.delete(tempHostDb, true);
+      }
+      LockUtil.removeLockFile(fs, lock);
+      throw e;
+    }
+
+    LockUtil.removeLockFile(fs, lock);
+    long end = System.currentTimeMillis();
+    LOG.info("UpdateHostDb: finished at " + sdf.format(end) +
+      ", elapsed: " + TimingUtil.elapsedTime(start, end));
+  }
+
+  public static void main(String args[]) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new UpdateHostDb(), args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println("Usage: UpdateHostDb -hostdb <hostdb> " +
+        "[-tophosts <tophosts>] [-crawldb <crawldb>] [-checkAll] [-checkFailed]" +
+        " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize]");
+      return -1;
+    }
+
+    Path hostDb = null;
+    Path crawlDb = null;
+    Path topHosts = null;
+
+    boolean checkFailed = false;
+    boolean checkNew = false;
+    boolean checkKnown = false;
+    boolean force = false;
+
+    boolean filter = false;
+    boolean normalize = false;
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-hostdb")) {
+        hostDb = new Path(args[i + 1]);
+        LOG.info("UpdateHostDb: hostdb: " + hostDb);
+        i++;
+      }
+      if (args[i].equals("-crawldb")) {
+        crawlDb = new Path(args[i + 1]);
+        LOG.info("UpdateHostDb: crawldb: " + crawlDb);
+        i++;
+      }
+      if (args[i].equals("-tophosts")) {
+        topHosts = new Path(args[i + 1]);
+        LOG.info("UpdateHostDb: tophosts: " + topHosts);
+        i++;
+      }
+
+      if (args[i].equals("-checkFailed")) {
+        LOG.info("UpdateHostDb: checking failed hosts");
+        checkFailed = true;
+      }
+      if (args[i].equals("-checkNew")) {
+        LOG.info("UpdateHostDb: checking new hosts");
+        checkNew = true;
+      }
+      if (args[i].equals("-checkKnown")) {
+        LOG.info("UpdateHostDb: checking known hosts");
+        checkKnown = true;
+      }
+      if (args[i].equals("-checkAll")) {
+        LOG.info("UpdateHostDb: checking all hosts");
+        checkFailed = true;
+        checkNew = true;
+        checkKnown = true;
+      }
+      if (args[i].equals("-force")) {
+        LOG.info("UpdateHostDb: forced check");
+        force = true;
+      }
+      if (args[i].equals("-filter")) {
+        LOG.info("UpdateHostDb: filtering enabled");
+        filter = true;
+      }
+      if (args[i].equals("-normalize")) {
+        LOG.info("UpdateHostDb: normalizing enabled");
+        normalize = true;
+      }
+    }
+
+    if (hostDb == null) {
+      System.err.println("hostDb is mandatory");
+      return -1;
+    }
+
+    try {
+      updateHostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew,
+        checkKnown, force, filter, normalize);
+
+      return 0;
+    } catch (Exception e) {
+      LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
new file mode 100644
index 0000000..5844b04
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.hostdb;
+
+import java.io.IOException;
+
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.URLUtil;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Mapper ingesting HostDB and CrawlDB entries. Additionally it can also read
+ * host score info from a plain text key/value file generated by the
+ * Webgraph's NodeDumper tool.
+ */
+public class UpdateHostDbMapper
+  implements Mapper<Text, Writable, Text, NutchWritable> {
+  
+  public static final Logger LOG = LoggerFactory.getLogger(UpdateHostDbMapper.class);
+  protected Text host = new Text();
+  protected HostDatum hostDatum = null;
+  protected CrawlDatum crawlDatum = null;
+  protected String reprUrl = null;
+  protected String buffer = null;
+  protected String[] args = null;
+  protected boolean filter = false;
+  protected boolean normalize = false;
+  protected boolean readingCrawlDb = false;
+  protected URLFilters filters = null;
+  protected URLNormalizers normalizers = null;
+
+  public void close() {}
+
+  /**
+   * @param JobConf
+   * @return void
+   */
+  public void configure(JobConf job) {
+    readingCrawlDb = job.getBoolean("hostdb.reading.crawldb", false);
+    filter = job.getBoolean(UpdateHostDb.HOSTDB_URL_FILTERING, false);
+    normalize = job.getBoolean(UpdateHostDb.HOSTDB_URL_NORMALIZING, false);
+
+    if (filter)
+      filters = new URLFilters(job);
+    if (normalize)
+      normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_DEFAULT);
+  }
+
+  /**
+   * Filters and or normalizes the input URL
+   *
+   * @param String
+   * @return String
+   */
+  protected String filterNormalize(String url) {
+    // We actually receive a hostname here so let's make a URL
+    // TODO: we force shop.fcgroningen to be https, how do we know that here?
+    // http://issues.openindex.io/browse/SPIDER-40
+    url = "http://" + url + "/";
+
+    try {
+      if (normalize)
+        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+      if (filter)
+        url = filters.filter(url);
+      if (url == null)
+        return null;
+    } catch (Exception e) {
+      return null;
+    }
+
+    // Turn back to host
+    return URLUtil.getHost(url);
+  }
+
+  /**
+    * Mapper ingesting records from the HostDB, CrawlDB and plaintext host
+    * scores file. Statistics and scores are passed on.
+    *
+    * @param Text key
+    * @param Writable value
+    * @param OutputCollector<Text,NutchWritable> output
+    * @param Reporter reporter
+    * @return void
+    */
+  public void map(Text key, Writable value,
+    OutputCollector<Text,NutchWritable> output, Reporter reporter)
+    throws IOException {
+
+    // Get the key!
+    String keyStr = key.toString();
+
+    // Check if we process records from the CrawlDB
+    if (key instanceof Text && value instanceof CrawlDatum) {
+      // Get the normalized and filtered host of this URL
+      buffer = filterNormalize(URLUtil.getHost(keyStr));
+
+      // Filtered out?
+      if (buffer == null) {
+        reporter.incrCounter("UpdateHostDb", "filtered_records", 1);
+        LOG.info("UpdateHostDb: " + URLUtil.getHost(keyStr) + " crawldatum has been filtered");
+        return;
+      }
+
+      // Set the host of this URL
+      host.set(buffer);
+      crawlDatum = (CrawlDatum)value;
+      hostDatum = new HostDatum();
+
+      /**
+        * TODO: fix multi redirects: host_a => host_b/page => host_c/page/whatever
+        * http://www.ferienwohnung-armbruster.de/
+        * http://www.ferienwohnung-armbruster.de/website/
+        * http://www.ferienwohnung-armbruster.de/website/willkommen.php
+        *
+        * We cannot reresolve redirects for host objects as CrawlDatum metadata is
+        * not available. We also cannot reliably use the reducer in all cases
+        * since redirects may be across hosts or even domains. The example
+        * above has redirects that will end up in the same reducer. During that
+        * phase, however, we do not know which URL redirects to the next URL.
+        */
+      // Do not resolve homepages when the root URL is unfetched
+      if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
+        // Get the protocol
+        String protocol = URLUtil.getProtocol(keyStr);
+        
+        // Get the proposed homepage URL
+        String homepage = protocol + "://" + buffer + "/";
+
+        // Check if the current key is equals the host
+        if (keyStr.equals(homepage)) {
+          // Check if this is a redirect to the real home page
+          if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
+            crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+
+            // Obtain the repr url for this redirect via protocolstatus from the metadata
+            ProtocolStatus z = (ProtocolStatus)crawlDatum.getMetaData().
+              get(Nutch.WRITABLE_PROTO_STATUS_KEY);
+
+            // Get the protocol status' arguments
+            args = z.getArgs();
+
+            // ..and the possible redirect URL
+            reprUrl = args[0];
+
+            // Am i a redirect?
+            if (reprUrl != null) {
+              LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]);
+              output.collect(host, new NutchWritable(hostDatum));
+              hostDatum.setHomepageUrl(reprUrl);
+            } else {
+              LOG.info("UpdateHostDb: homepage: " + keyStr + 
+                " redirects to: " + args[0] + " but has been filtered out");
+            }
+          } else {
+            hostDatum.setHomepageUrl(homepage);
+            output.collect(host, new NutchWritable(hostDatum));
+            LOG.info("UpdateHostDb: homepage: " + homepage);
+          }
+        }
+      }
+
+      // Always emit crawl datum
+      output.collect(host, new NutchWritable(crawlDatum));
+    }
+
+    // Check if we got a record from the hostdb
+    if (key instanceof Text && value instanceof HostDatum) {
+      buffer = filterNormalize(keyStr);
+
+      // Filtered out?
+      if (buffer == null) {
+        reporter.incrCounter("UpdateHostDb", "filtered_records", 1);
+        LOG.info("UpdateHostDb: " + key.toString() + " hostdatum has been filtered");
+        return;
+      }
+
+      // Get a HostDatum
+      hostDatum = (HostDatum)value;
+      key.set(buffer);
+
+      // If we're also reading CrawlDb entries, reset db_* statistics because
+      // we're aggregating them from CrawlDB anyway
+      if (readingCrawlDb) {
+        hostDatum.resetStatistics();
+      }
+
+      output.collect(key, new NutchWritable(hostDatum));
+    }
+
+    // Check if we got a record with host scores
+    if (key instanceof Text && value instanceof Text) {
+      buffer = filterNormalize(keyStr);
+
+      // Filtered out?
+      if (buffer == null) {
+        reporter.incrCounter("UpdateHostDb", "filtered_records", 1);
+        LOG.info("UpdateHostDb: " + key.toString() + " score has been filtered");
+        return;
+      }
+
+      key.set(buffer);
+
+      output.collect(key,
+        new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
new file mode 100644
index 0000000..33dd18b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -0,0 +1,427 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.hostdb;
+
+import java.io.IOException;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.SynchronousQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.StringUtils;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+
+import com.tdunning.math.stats.TDigest;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reducer of the UpdateHostDb job: aggregates per-host CrawlDatum status
+ * counts, metadata statistics (min/max/avg/percentiles for numeric fields,
+ * value counts for string fields) and scores into a single {@link HostDatum},
+ * and optionally schedules DNS (re)checks on a pool of resolver threads.
+ */
+public class UpdateHostDbReducer
+  implements Reducer<Text, NutchWritable, Text, HostDatum> {
+
+  public static final Logger LOG = LoggerFactory.getLogger(UpdateHostDbReducer.class);
+  protected ResolverThread resolverThread = null;
+  protected Integer numResolverThreads = 10;
+  protected static Integer purgeFailedHostsThreshold = -1;
+  protected static Integer recheckInterval = 86400000;
+  protected static boolean checkFailed = false;
+  protected static boolean checkNew = false;
+  protected static boolean checkKnown = false;
+  protected static boolean force = false;
+  protected static long now = new Date().getTime();
+  protected static String[] numericFields;
+  protected static String[] stringFields;
+  protected static int[] percentiles;
+  protected static Text[] numericFieldWritables;
+  protected static Text[] stringFieldWritables;
+  
+  protected BlockingQueue<Runnable> queue = new SynchronousQueue<Runnable>();
+  protected ThreadPoolExecutor executor = null;
+
+  /**
+    * Configures the thread pool and prestarts all resolver threads.
+    *
+    * @param job the job configuration carrying the HOSTDB_* settings
+    */
+  public void configure(JobConf job) {
+    purgeFailedHostsThreshold = job.getInt(UpdateHostDb.HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD, -1);
+    numResolverThreads = job.getInt(UpdateHostDb.HOSTDB_NUM_RESOLVER_THREADS, 10);
+    recheckInterval = job.getInt(UpdateHostDb.HOSTDB_RECHECK_INTERVAL, 86400) * 1000;
+    checkFailed = job.getBoolean(UpdateHostDb.HOSTDB_CHECK_FAILED, false);
+    checkNew = job.getBoolean(UpdateHostDb.HOSTDB_CHECK_NEW, false);
+    checkKnown = job.getBoolean(UpdateHostDb.HOSTDB_CHECK_KNOWN, false);
+    force = job.getBoolean(UpdateHostDb.HOSTDB_FORCE_CHECK, false);
+    numericFields = job.getStrings(UpdateHostDb.HOSTDB_NUMERIC_FIELDS);
+    stringFields = job.getStrings(UpdateHostDb.HOSTDB_STRING_FIELDS);
+    percentiles = job.getInts(UpdateHostDb.HOSTDB_PERCENTILES);
+    
+    // Pre-build Text keys for the metadata fields we need to collect,
+    // so the reduce loop does not allocate a Text per lookup
+    if (numericFields != null) {
+      numericFieldWritables = new Text[numericFields.length];
+      for (int i = 0; i < numericFields.length; i++) {
+        numericFieldWritables[i] = new Text(numericFields[i]);
+      }
+    }
+    
+    if (stringFields != null) {
+      stringFieldWritables = new Text[stringFields.length];
+      for (int i = 0; i < stringFields.length; i++) {
+        stringFieldWritables[i] = new Text(stringFields[i]);
+      }
+    }
+
+    // Initialize the thread pool with our queue; SynchronousQueue hands each
+    // ResolverThread directly to an idle worker (queue.put blocks otherwise)
+    executor = new ThreadPoolExecutor(numResolverThreads, numResolverThreads,
+      5, TimeUnit.SECONDS, queue);
+
+    // Run all threads in the pool
+    executor.prestartAllCoreThreads();
+  }
+
+  /**
+    * Aggregates all values for one host: counts CrawlDatum statuses, gathers
+    * metadata statistics, merges an existing HostDatum and a Web Graph score,
+    * then either hands the record to a resolver thread (which emits it) or
+    * emits it directly.
+    */
+  public void reduce(Text key, Iterator<NutchWritable> values,
+    OutputCollector<Text,HostDatum> output, Reporter reporter) throws IOException {
+
+    Map<String,Map<String,Integer>> stringCounts = new HashMap<String,Map<String, Integer>>();
+    Map<String,Float> maximums = new HashMap<String,Float>();
+    Map<String,Float> sums = new HashMap<String,Float>(); // used to calc averages
+    Map<String,Integer> counts = new HashMap<String,Integer>(); // used to calc averages
+    Map<String,Float> minimums = new HashMap<String,Float>();
+    Map<String,TDigest> tdigests = new HashMap<String,TDigest>();
+    
+    HostDatum hostDatum = new HostDatum();
+    float score = 0;
+    
+    if (stringFields != null) {
+      for (int i = 0; i < stringFields.length; i++) {
+        stringCounts.put(stringFields[i], new HashMap<String,Integer>());
+      }
+    }
+    
+    // Loop through all values until we find a non-empty HostDatum or use
+    // an empty if this is a new host for the host db
+    while (values.hasNext()) {
+      Writable value = values.next().get();
+      
+      // Count crawl datum status's and collect metadata from fields
+      if (value instanceof CrawlDatum) {
+        CrawlDatum buffer = (CrawlDatum)value;
+        
+        // Set the correct status field
+        switch (buffer.getStatus()) {
+          case CrawlDatum.STATUS_DB_UNFETCHED:
+            hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
+            break;
+
+          case CrawlDatum.STATUS_DB_FETCHED:
+            hostDatum.setFetched(hostDatum.getFetched() + 1);
+            break;
+
+          case CrawlDatum.STATUS_DB_GONE:
+            hostDatum.setGone(hostDatum.getGone() + 1);
+            break;
+
+          case CrawlDatum.STATUS_DB_REDIR_TEMP:
+            hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
+            break;
+
+          case CrawlDatum.STATUS_DB_REDIR_PERM:
+            hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
+            break;
+
+          case CrawlDatum.STATUS_DB_NOTMODIFIED:
+            hostDatum.setNotModified(hostDatum.getNotModified() + 1);
+            break;
+        }
+        
+        // Record connection failures
+        if (buffer.getRetriesSinceFetch() != 0) {
+          hostDatum.incConnectionFailures();
+        }
+        
+        // Only gather metadata statistics for proper fetched pages
+        if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {            
+          // Deal with the string fields
+          if (stringFields != null) {
+            for (int i = 0; i < stringFields.length; i++) {
+              // Does this field exist?
+              if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
+                // Get it!
+                String metadataValue = null;
+                try {
+                  metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
+                } catch (Exception e) {
+                  LOG.error("Metadata field " + stringFields[i] + " could not be read as a string value");
+                }
+              
+                // Does the value exist?
+                if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
+                  // Yes, increment it
+                  stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1);
+                } else {
+                  // Create it!
+                  stringCounts.get(stringFields[i]).put(metadataValue, 1);
+                }
+              }
+            }
+          }
+          
+          // Deal with the numeric fields
+          if (numericFields != null) {
+            for (int i = 0; i < numericFields.length; i++) {
+              // Does this field exist?
+              if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
+                try {
+                  // Get it!
+                  Float metadataValue = Float.parseFloat(buffer.getMetaData().get(numericFieldWritables[i]).toString());
+                  
+                  // Feed the t-digest so we can emit percentiles later on
+                  if (tdigests.containsKey(numericFields[i])) {
+                    tdigests.get(numericFields[i]).add(metadataValue);
+                  } else {
+                    // Create it!
+                    TDigest tdigest = TDigest.createDigest(100);
+                    tdigest.add((double)metadataValue);
+                    tdigests.put(numericFields[i], tdigest);
+                  }
+                
+                  // Does the minimum value exist?
+                  if (minimums.containsKey(numericFields[i])) {
+                    // Write if this is lower than existing value
+                    if (metadataValue < minimums.get(numericFields[i])) {
+                      minimums.put(numericFields[i], metadataValue);
+                    }
+                  } else {
+                    // Create it!
+                    minimums.put(numericFields[i], metadataValue);
+                  }
+                  
+                  // Does the maximum value exist?
+                  if (maximums.containsKey(numericFields[i])) {
+                    // Write if this is higher than existing value
+                    if (metadataValue > maximums.get(numericFields[i])) {
+                      maximums.put(numericFields[i], metadataValue);
+                    }
+                  } else {
+                    // Create it!
+                    maximums.put(numericFields[i], metadataValue);
+                  }
+                  
+                  // Sum it up!
+                  if (sums.containsKey(numericFields[i])) {
+                    // Increment
+                    sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
+                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
+                  } else {
+                    // Create it!
+                    sums.put(numericFields[i], metadataValue);
+                    counts.put(numericFields[i], 1);
+                  }
+                } catch (Exception e) {
+                  LOG.error(e.getMessage() + " when processing values for " + key.toString());
+                }
+              }
+            }
+          }
+        }
+      }
+      
+      // Merge an existing HostDatum record for this host, if any
+      if (value instanceof HostDatum) {
+        HostDatum buffer = (HostDatum)value;
+
+        // Check homepage URL
+        if (buffer.hasHomepageUrl()) {
+          hostDatum.setHomepageUrl(buffer.getHomepageUrl());
+        }
+
+        // Check lastCheck timestamp
+        if (!buffer.isEmpty()) {
+          hostDatum.setLastCheck(buffer.getLastCheck());
+        }
+
+        // Check and set DNS failures
+        if (buffer.getDnsFailures() > 0) {
+          hostDatum.setDnsFailures(buffer.getDnsFailures());
+        }
+
+        // Check and set connection failures
+        if (buffer.getConnectionFailures() > 0) {
+          hostDatum.setConnectionFailures(buffer.getConnectionFailures());
+        }
+        
+        // Check metadata
+        if (!buffer.getMetaData().isEmpty()) {
+          hostDatum.setMetaData(buffer.getMetaData());
+        }
+
+        // Check and set score (score from Web Graph has precedence)
+        if (buffer.getScore() > 0) {
+          hostDatum.setScore(buffer.getScore());
+        }
+      }
+
+      // Check for the score
+      if (value instanceof FloatWritable) {
+        FloatWritable buffer = (FloatWritable)value;
+        score = buffer.get();
+      }
+    }
+
+    // Check if score was set from Web Graph
+    if (score > 0) {
+      hostDatum.setScore(score);
+    }
+    
+    // Set metadata
+    for (Map.Entry<String, Map<String,Integer>> entry : stringCounts.entrySet()) {
+      for (Map.Entry<String,Integer> subEntry : entry.getValue().entrySet()) {
+        hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new IntWritable(subEntry.getValue()));
+      }
+    }
+    for (Map.Entry<String, Float> entry : maximums.entrySet()) {
+      hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
+    }
+    for (Map.Entry<String, Float> entry : sums.entrySet()) {
+      hostDatum.getMetaData().put(new Text("avg." + entry.getKey()), new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
+    }
+    for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
+      // Emit all percentiles; TDigest.quantile expects a fraction in [0,1],
+      // so map e.g. the configured 75 to 0.75 (previously this always asked
+      // for quantile(0.5), emitting the median for every configured percentile)
+      for (int i = 0; i < percentiles.length; i++) {
+        hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float)entry.getValue().quantile((double)percentiles[i] / 100.0d)));
+      }
+    }      
+    for (Map.Entry<String, Float> entry : minimums.entrySet()) {
+      hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
+    }
+    
+    reporter.incrCounter("UpdateHostDb", "total_hosts", 1);
+
+    // See if this record is to be checked
+    if (shouldCheck(hostDatum)) {
+      // Make an entry
+      resolverThread = new ResolverThread(key.toString(), hostDatum, output, reporter, purgeFailedHostsThreshold);
+
+      // Add the entry to the queue (blocking)
+      try {
+        queue.put(resolverThread);
+      } catch (InterruptedException e) {
+        LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
+      }
+
+      // Do not progress, the datum will be written in the resolver thread
+      return;
+    } else {
+      reporter.incrCounter("UpdateHostDb", "skipped_not_eligible", 1);
+      LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
+    }
+
+    // Write the host datum if it wasn't written by the resolver thread
+    output.collect(key, hostDatum);
+  }
+
+  /**
+    * Determines whether a record should be checked.
+    *
+    * @param datum the aggregated host record
+    * @return true if the host must be handed to a resolver thread
+    */
+  protected boolean shouldCheck(HostDatum datum) {
+    // Whether a new record is to be checked
+    if (checkNew && datum.isEmpty()) {
+      return true;
+    }
+
+    // Whether existing known hosts should be rechecked
+    if (checkKnown && !datum.isEmpty() && datum.getDnsFailures() == 0) {
+      return isEligibleForCheck(datum);
+    }
+
+    // Whether failed records are forced to be rechecked
+    if (checkFailed && datum.getDnsFailures() > 0) {
+      return isEligibleForCheck(datum);
+    }
+
+    // It seems this record is not to be checked
+    return false;
+  }
+
+  /**
+    * Determines whether a record is eligible for recheck.
+    *
+    * @param datum the aggregated host record
+    * @return true if forced, or if the recheck window applies
+    */
+  protected boolean isEligibleForCheck(HostDatum datum) {
+    // Whether an existing host, known or unknown, if forced to be rechecked
+    // NOTE(review): the parenthesization below yields
+    // (recheckInterval * dnsFailures) + 1 ms, i.e. a 1 ms window for hosts
+    // without failures; the intent was presumably
+    // recheckInterval * (dnsFailures + 1). Also confirm the comparison
+    // direction ('>') matches the intended recheck scheduling semantics.
+    if (force || datum.getLastCheck().getTime() +
+      (recheckInterval * datum.getDnsFailures() + 1) > now) {
+      return true;
+    }
+
+    return false;
+  }
+
+  /**
+    * Shut down all running threads and wait for completion.
+    */
+  public void close() {
+    LOG.info("UpdateHostDb: feeder finished, waiting for shutdown");
+
+    // If we're here all keys have been fed and we can issue a shut down
+    executor.shutdown();
+
+    boolean finished = false;
+
+    // Wait until all resolvers have finished
+    while (!finished) {
+      try {
+        // Wait for the executor to shut down completely
+        if (!executor.isTerminated()) {
+          LOG.info("UpdateHostDb: resolver threads waiting: " + Integer.toString(executor.getPoolSize()));
+          Thread.sleep(1000);
+        } else {
+          // All is well, get out
+          finished = true;
+        }
+      } catch (InterruptedException e) {
+        // Huh?
+        LOG.warn(StringUtils.stringifyException(e));
+      }
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/CleaningJob.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/CleaningJob.java b/nutch-core/src/main/java/org/apache/nutch/indexer/CleaningJob.java
new file mode 100644
index 0000000..c16003a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/CleaningJob.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Iterator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.ByteWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.lib.NullOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * The class scans CrawlDB looking for entries with status DB_GONE (404) or
+ * DB_DUPLICATE and sends delete requests to indexers for those documents.
+ */
+
+public class CleaningJob implements Tool {
+  public static final Logger LOG = LoggerFactory.getLogger(CleaningJob.class);
+  private Configuration conf;
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Mapper: emits the URL of every CrawlDb entry whose status is DB_GONE or
+   * DB_DUPLICATE, keyed by a constant byte so all deletions funnel into a
+   * single reducer group.
+   */
+  public static class DBFilter implements
+      Mapper<Text, CrawlDatum, ByteWritable, Text> {
+    // Constant map-output key; its value (STATUS_DB_GONE) is arbitrary,
+    // it only serves to group all URLs to delete together
+    private ByteWritable OUT = new ByteWritable(CrawlDatum.STATUS_DB_GONE);
+
+    @Override
+    public void configure(JobConf arg0) {
+    }
+
+    @Override
+    public void close() throws IOException {
+    }
+
+    @Override
+    public void map(Text key, CrawlDatum value,
+        OutputCollector<ByteWritable, Text> output, Reporter reporter)
+        throws IOException {
+
+      // Forward only entries that should be removed from the index
+      if (value.getStatus() == CrawlDatum.STATUS_DB_GONE
+          || value.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+        output.collect(OUT, key);
+      }
+    }
+  }
+
+  /**
+   * Reducer: issues a delete request to all configured {@link IndexWriters}
+   * for every URL received, and commits on close unless -noCommit was given.
+   */
+  public static class DeleterReducer implements
+      Reducer<ByteWritable, Text, Text, ByteWritable> {
+    private static final int NUM_MAX_DELETE_REQUEST = 1000;
+    private int numDeletes = 0;
+    private int totalDeleted = 0;
+
+    private boolean noCommit = false;
+
+    IndexWriters writers = null;
+
+    @Override
+    public void configure(JobConf job) {
+      writers = new IndexWriters(job);
+      try {
+        writers.open(job, "Deletion");
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+      noCommit = job.getBoolean("noCommit", false);
+    }
+
+    @Override
+    public void close() throws IOException {
+      // BUFFERING OF CALLS TO INDEXER SHOULD BE HANDLED AT INDEXER LEVEL
+      // if (numDeletes > 0) {
+      // LOG.info("CleaningJob: deleting " + numDeletes + " documents");
+      // // TODO updateRequest.process(solr);
+      // totalDeleted += numDeletes;
+      // }
+
+      writers.close();
+
+      // Commit only when something was deleted and commits weren't disabled
+      if (totalDeleted > 0 && !noCommit) {
+        writers.commit();
+      }
+
+      LOG.info("CleaningJob: deleted a total of " + totalDeleted + " documents");
+    }
+
+    @Override
+    public void reduce(ByteWritable key, Iterator<Text> values,
+        OutputCollector<Text, ByteWritable> output, Reporter reporter)
+        throws IOException {
+      while (values.hasNext()) {
+        Text document = values.next();
+        writers.delete(document.toString());
+        totalDeleted++;
+        reporter.incrCounter("CleaningJobStatus", "Deleted documents", 1);
+        // if (numDeletes >= NUM_MAX_DELETE_REQUEST) {
+        // LOG.info("CleaningJob: deleting " + numDeletes
+        // + " documents");
+        // // TODO updateRequest.process(solr);
+        // // TODO updateRequest = new UpdateRequest();
+        // writers.delete(key.toString());
+        // totalDeleted += numDeletes;
+        // numDeletes = 0;
+        // }
+      }
+    }
+  }
+
+  /**
+   * Configures and runs the deletion MapReduce job over the given CrawlDb.
+   *
+   * @param crawldb path to the CrawlDb whose "current" segment is scanned
+   * @param noCommit when true, deletions are sent but not committed
+   * @throws IOException if the job fails
+   */
+  public void delete(String crawldb, boolean noCommit) throws IOException {
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("CleaningJob: starting at " + sdf.format(start));
+
+    JobConf job = new NutchJob(getConf());
+
+    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
+    job.setBoolean("noCommit", noCommit);
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setOutputFormat(NullOutputFormat.class);
+    job.setMapOutputKeyClass(ByteWritable.class);
+    job.setMapOutputValueClass(Text.class);
+    job.setMapperClass(DBFilter.class);
+    job.setReducerClass(DeleterReducer.class);
+
+    job.setJobName("CleaningJob");
+
+    // need to explicitly allow deletions
+    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, true);
+
+    JobClient.runJob(job);
+
+    long end = System.currentTimeMillis();
+    LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  /**
+   * Command-line entry: {@code CleaningJob <crawldb> [-noCommit]}.
+   *
+   * @return 0 on success, 1 on usage error, -1 on job failure
+   */
+  public int run(String[] args) throws IOException {
+    if (args.length < 1) {
+      String usage = "Usage: CleaningJob <crawldb> [-noCommit]";
+      LOG.error("Missing crawldb. " + usage);
+      System.err.println(usage);
+      IndexWriters writers = new IndexWriters(getConf());
+      System.err.println(writers.describe());
+      return 1;
+    }
+
+    boolean noCommit = false;
+    if (args.length == 2 && args[1].equals("-noCommit")) {
+      noCommit = true;
+    }
+
+    try {
+      delete(args[0], noCommit);
+    } catch (final Exception e) {
+      LOG.error("CleaningJob: " + StringUtils.stringifyException(e));
+      System.err.println("ERROR CleaningJob: "
+          + StringUtils.stringifyException(e));
+      return -1;
+    }
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    int result = ToolRunner.run(NutchConfiguration.create(), new CleaningJob(),
+        args);
+    System.exit(result);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexWriter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexWriter.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexWriter.java
new file mode 100644
index 0000000..fbbf2e8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexWriter.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.plugin.Pluggable;
+
+public interface IndexWriter extends Pluggable, Configurable {
+  /** The name of the extension point. */
+  final static String X_POINT_ID = IndexWriter.class.getName();
+
+  /** Prepares the writer for use; called once before any write/delete. */
+  public void open(JobConf job, String name) throws IOException;
+
+  /** Adds the given document to the index backend. */
+  public void write(NutchDocument doc) throws IOException;
+
+  /** Removes the document identified by {@code key} from the index backend. */
+  public void delete(String key) throws IOException;
+
+  /** Updates an existing document in the index backend. */
+  public void update(NutchDocument doc) throws IOException;
+
+  /** Makes pending writes/deletes visible (backend-specific semantics). */
+  public void commit() throws IOException;
+
+  /** Releases resources held by the writer; no calls are valid afterwards. */
+  public void close() throws IOException;
+
+  /**
+   * Returns a String describing the IndexWriter instance and the specific
+   * parameters it can take
+   */
+  public String describe();
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexWriters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexWriters.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexWriters.java
new file mode 100644
index 0000000..681812b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexWriters.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.ObjectCache;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Creates and caches {@link IndexWriter} implementing plugins. */
+public class IndexWriters {
+
+  public final static Logger LOG = LoggerFactory.getLogger(IndexWriters.class);
+
+  private IndexWriter[] indexWriters;
+
+  /**
+   * Loads all {@link IndexWriter} extensions from the plugin repository,
+   * de-duplicating by implementation class name, and caches the resulting
+   * array in the per-configuration {@link ObjectCache}.
+   *
+   * @param conf the job/task configuration
+   */
+  public IndexWriters(Configuration conf) {
+    ObjectCache objectCache = ObjectCache.get(conf);
+    synchronized (objectCache) {
+      this.indexWriters = (IndexWriter[]) objectCache
+          .getObject(IndexWriter.class.getName());
+      if (this.indexWriters == null) {
+        try {
+          ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+              IndexWriter.X_POINT_ID);
+          if (point == null)
+            throw new RuntimeException(IndexWriter.X_POINT_ID + " not found.");
+          Extension[] extensions = point.getExtensions();
+          // Key by class name so each writer implementation is kept once
+          HashMap<String, IndexWriter> indexerMap = new HashMap<String, IndexWriter>();
+          for (Extension extension : extensions) {
+            IndexWriter writer = (IndexWriter) extension.getExtensionInstance();
+            LOG.info("Adding " + writer.getClass().getName());
+            if (!indexerMap.containsKey(writer.getClass().getName())) {
+              indexerMap.put(writer.getClass().getName(), writer);
+            }
+          }
+          objectCache.setObject(IndexWriter.class.getName(), indexerMap
+              .values().toArray(new IndexWriter[0]));
+        } catch (PluginRuntimeException e) {
+          throw new RuntimeException(e);
+        }
+        this.indexWriters = (IndexWriter[]) objectCache
+            .getObject(IndexWriter.class.getName());
+      }
+    }
+  }
+
+  /** Opens every configured writer; any IOException aborts the sequence. */
+  public void open(JobConf job, String name) throws IOException {
+    for (IndexWriter writer : this.indexWriters) {
+      writer.open(job, name);
+    }
+  }
+
+  /** Writes the document to every configured writer. */
+  public void write(NutchDocument doc) throws IOException {
+    for (IndexWriter writer : this.indexWriters) {
+      writer.write(doc);
+    }
+  }
+
+  /** Updates the document in every configured writer. */
+  public void update(NutchDocument doc) throws IOException {
+    for (IndexWriter writer : this.indexWriters) {
+      writer.update(doc);
+    }
+  }
+
+  /** Deletes the document identified by {@code key} from every writer. */
+  public void delete(String key) throws IOException {
+    for (IndexWriter writer : this.indexWriters) {
+      writer.delete(key);
+    }
+  }
+
+  /** Closes every configured writer. */
+  public void close() throws IOException {
+    for (IndexWriter writer : this.indexWriters) {
+      writer.close();
+    }
+  }
+
+  /** Commits every configured writer. */
+  public void commit() throws IOException {
+    for (IndexWriter writer : this.indexWriters) {
+      writer.commit();
+    }
+  }
+
+  /**
+   * Lists the active IndexWriters and their configuration.
+   * The {@code throws IOException} clause is kept for caller compatibility.
+   */
+  public String describe() throws IOException {
+    StringBuilder buffer = new StringBuilder();
+    if (this.indexWriters.length == 0)
+      buffer.append("No IndexWriters activated - check your configuration\n");
+    else
+      buffer.append("Active IndexWriters :\n");
+    for (IndexWriter writer : this.indexWriters) {
+      buffer.append(writer.describe()).append("\n");
+    }
+    return buffer.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexerMapReduce.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexerMapReduce.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexerMapReduce.java
new file mode 100644
index 0000000..5025525
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -0,0 +1,422 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Iterator;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.StringUtils;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
+public class IndexerMapReduce extends Configured implements
+    Mapper<Text, Writable, Text, NutchWritable>,
+    Reducer<Text, NutchWritable, Text, NutchIndexAction> {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(IndexerMapReduce.class);
+
+  public static final String INDEXER_PARAMS = "indexer.additional.params";
+  public static final String INDEXER_DELETE = "indexer.delete";
+  public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex";
+  public static final String INDEXER_DELETE_SKIPPED = "indexer.delete.skipped.by.indexingfilter";
+  public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
+  public static final String URL_FILTERING = "indexer.url.filters";
+  public static final String URL_NORMALIZING = "indexer.url.normalizers";
+  public static final String INDEXER_BINARY_AS_BASE64 = "indexer.binary.base64";
+
+  private boolean skip = false;
+  private boolean delete = false;
+  private boolean deleteRobotsNoIndex = false;
+  private boolean deleteSkippedByIndexingFilter = false;
+  private boolean base64 = false;
+  private IndexingFilters filters;
+  private ScoringFilters scfilters;
+
+  // using normalizers and/or filters
+  private boolean normalize = false;
+  private boolean filter = false;
+
+  // url normalizers, filters and job configuration
+  private URLNormalizers urlNormalizers;
+  private URLFilters urlFilters;
+
+  /** Predefined action to delete documents from the index */
+  private static final NutchIndexAction DELETE_ACTION = new NutchIndexAction(
+      null, NutchIndexAction.DELETE);
+
+  public void configure(JobConf job) {
+    setConf(job);
+    this.filters = new IndexingFilters(getConf());
+    this.scfilters = new ScoringFilters(getConf());
+    this.delete = job.getBoolean(INDEXER_DELETE, false);
+    this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX,
+        false);
+    this.deleteSkippedByIndexingFilter = job.getBoolean(INDEXER_DELETE_SKIPPED,
+        false);
+    this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
+    this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false);
+
+    normalize = job.getBoolean(URL_NORMALIZING, false);
+    filter = job.getBoolean(URL_FILTERING, false);
+
+    if (normalize) {
+      urlNormalizers = new URLNormalizers(getConf(),
+          URLNormalizers.SCOPE_INDEXER);
+    }
+
+    if (filter) {
+      urlFilters = new URLFilters(getConf());
+    }
+  }
+
+  /**
+   * Normalizes and trims extra whitespace from the given url.
+   * 
+   * @param url
+   *          The url to normalize.
+   * 
+   * @return The normalized url.
+   */
+  private String normalizeUrl(String url) {
+    if (!normalize) {
+      return url;
+    }
+
+    String normalized = null;
+    if (urlNormalizers != null) {
+      try {
+
+        // normalize and trim the url
+        normalized = urlNormalizers
+            .normalize(url, URLNormalizers.SCOPE_INDEXER);
+        normalized = normalized.trim();
+      } catch (Exception e) {
+        LOG.warn("Skipping " + url + ":" + e);
+        normalized = null;
+      }
+    }
+
+    return normalized;
+  }
+
+  /**
+   * Filters the given url.
+   * 
+   * @param url
+   *          The url to filter.
+   * 
+   * @return The filtered url or null.
+   */
+  private String filterUrl(String url) {
+    if (!filter) {
+      return url;
+    }
+
+    try {
+      url = urlFilters.filter(url);
+    } catch (Exception e) {
+      url = null;
+    }
+
+    return url;
+  }
+
+  public void map(Text key, Writable value,
+      OutputCollector<Text, NutchWritable> output, Reporter reporter)
+          throws IOException {
+
+    String urlString = filterUrl(normalizeUrl(key.toString()));
+    if (urlString == null) {
+      return;
+    } else {
+      key.set(urlString);
+    }
+
+    output.collect(key, new NutchWritable(value));
+  }
+
+  public void reduce(Text key, Iterator<NutchWritable> values,
+      OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
+          throws IOException {
+    Inlinks inlinks = null;
+    CrawlDatum dbDatum = null;
+    CrawlDatum fetchDatum = null;
+    Content content = null;
+    ParseData parseData = null;
+    ParseText parseText = null;
+
+    while (values.hasNext()) {
+      final Writable value = values.next().get(); // unwrap
+      if (value instanceof Inlinks) {
+        inlinks = (Inlinks) value;
+      } else if (value instanceof CrawlDatum) {
+        final CrawlDatum datum = (CrawlDatum) value;
+        if (CrawlDatum.hasDbStatus(datum)) {
+          dbDatum = datum;
+        } else if (CrawlDatum.hasFetchStatus(datum)) {
+          // don't index unmodified (empty) pages
+          if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
+            fetchDatum = datum;
+          }
+        } else if (CrawlDatum.STATUS_LINKED == datum.getStatus()
+            || CrawlDatum.STATUS_SIGNATURE == datum.getStatus()
+            || CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
+          continue;
+        } else {
+          throw new RuntimeException("Unexpected status: " + datum.getStatus());
+        }
+      } else if (value instanceof ParseData) {
+        parseData = (ParseData) value;
+
+        // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
+        if (deleteRobotsNoIndex) {
+          // Get the robots meta data
+          String robotsMeta = parseData.getMeta("robots");
+
+          // Has it a noindex for this url?
+          if (robotsMeta != null
+              && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
+            // Delete it!
+            output.collect(key, DELETE_ACTION);
+            reporter.incrCounter("IndexerStatus", "deleted (robots=noindex)", 1);
+            return;
+          }
+        }
+      } else if (value instanceof ParseText) {
+        parseText = (ParseText) value;
+      } else if (value instanceof Content) {
+        content = (Content)value;
+      } else if (LOG.isWarnEnabled()) {
+        LOG.warn("Unrecognized type: " + value.getClass());
+      }
+    }
+
+    // Whether to delete GONE or REDIRECTS
+    if (delete && fetchDatum != null && dbDatum != null) {
+      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
+          || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
+        reporter.incrCounter("IndexerStatus", "deleted (gone)", 1);
+        output.collect(key, DELETE_ACTION);
+        return;
+      }
+
+      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
+          || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
+          || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
+          || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+        reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1);
+        output.collect(key, DELETE_ACTION);
+        return;
+      }
+    }
+
+    if (fetchDatum == null || dbDatum == null || parseText == null
+        || parseData == null) {
+      return; // only have inlinks
+    }
+
+    // Whether to delete pages marked as duplicates
+    if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+      reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
+      output.collect(key, DELETE_ACTION);
+      return;
+    }
+
+    // Whether to skip DB_NOTMODIFIED pages
+    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+      reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
+      return;
+    }
+
+    if (!parseData.getStatus().isSuccess()
+        || fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
+      return;
+    }
+
+    NutchDocument doc = new NutchDocument();
+    doc.add("id", key.toString());
+
+    final Metadata metadata = parseData.getContentMeta();
+
+    // add segment, used to map from merged index back to segment files
+    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
+
+    // add digest, used by dedup
+    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
+    
+    final Parse parse = new ParseImpl(parseText, parseData);
+    float boost = 1.0f;
+    // run scoring filters
+    try {
+      boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
+          inlinks, boost);
+    } catch (final ScoringFilterException e) {
+      reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1);
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Error calculating score {}: {}", key, e);
+      }
+      return;
+    }
+    // apply boost to all indexed fields.
+    doc.setWeight(boost);
+    // store boost for use by explain and dedup
+    doc.add("boost", Float.toString(boost));
+
+    try {
+      // Indexing filters may also be interested in the signature
+      fetchDatum.setSignature(dbDatum.getSignature());
+      
+      // extract information from dbDatum and pass it to
+      // fetchDatum so that indexing filters can use it
+      final Text url = (Text) dbDatum.getMetaData().get(
+          Nutch.WRITABLE_REPR_URL_KEY);
+      if (url != null) {
+        // Representation URL also needs normalization and filtering.
+        // If repr URL is excluded by filters we still accept this document
+        // but represented by its primary URL ("key") which has passed URL
+        // filters.
+        String urlString = filterUrl(normalizeUrl(url.toString()));
+        if (urlString != null) {
+          url.set(urlString);
+          fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+        }
+      }
+      // run indexing filters
+      doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
+    } catch (final IndexingException e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Error indexing " + key + ": " + e);
+      }
+      reporter.incrCounter("IndexerStatus", "errors (IndexingFilter)", 1);
+      return;
+    }
+
+    // skip documents discarded by indexing filters
+    if (doc == null) {
+      // https://issues.apache.org/jira/browse/NUTCH-1449
+      if (deleteSkippedByIndexingFilter) {
+        NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+        output.collect(key, action);
+        reporter.incrCounter("IndexerStatus", "deleted (IndexingFilter)", 1);
+      } else {
+        reporter.incrCounter("IndexerStatus", "skipped (IndexingFilter)", 1);
+      }
+      return;
+    }
+
+    if (content != null) {
+      // Add the original binary content
+      String binary;
+      if (base64) {
+        // optionally encode as base64
+        binary = Base64.encodeBase64String(content.getContent());
+      } else {
+        binary = new String(content.getContent());
+      }
+      doc.add("binaryContent", binary);
+    }
+
+    reporter.incrCounter("IndexerStatus", "indexed (add/update)", 1);
+
+    NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
+    output.collect(key, action);
+  }
+
+  public void close() throws IOException {
+  }
+
+  public static void initMRJob(Path crawlDb, Path linkDb,
+      Collection<Path> segments, JobConf job, boolean addBinaryContent) {
+
+    LOG.info("IndexerMapReduce: crawldb: {}", crawlDb);
+
+    if (linkDb != null)
+      LOG.info("IndexerMapReduce: linkdb: {}", linkDb);
+
+    for (final Path segment : segments) {
+      LOG.info("IndexerMapReduces: adding segment: {}", segment);
+      FileInputFormat.addInputPath(job, new Path(segment,
+          CrawlDatum.FETCH_DIR_NAME));
+      FileInputFormat.addInputPath(job, new Path(segment,
+          CrawlDatum.PARSE_DIR_NAME));
+      FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
+      FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
+
+      if (addBinaryContent) {
+        FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+      }
+    }
+
+    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+
+    if (linkDb != null) {
+      Path currentLinkDb = new Path(linkDb, LinkDb.CURRENT_NAME);
+      try {
+        if (FileSystem.get(job).exists(currentLinkDb)) {
+          FileInputFormat.addInputPath(job, currentLinkDb);
+        } else {
+          LOG.warn("Ignoring linkDb for indexing, no linkDb found in path: {}",
+              linkDb);
+        }
+      } catch (IOException e) {
+        LOG.warn("Failed to use linkDb ({}) for indexing: {}", linkDb,
+            org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    }
+
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(IndexerMapReduce.class);
+    job.setReducerClass(IndexerMapReduce.class);
+
+    job.setOutputFormat(IndexerOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(NutchWritable.class);
+    job.setOutputValueClass(NutchWritable.class);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexerOutputFormat.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexerOutputFormat.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexerOutputFormat.java
new file mode 100644
index 0000000..baa9ce6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexerOutputFormat.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.Progressable;
+
+public class IndexerOutputFormat extends
+    FileOutputFormat<Text, NutchIndexAction> {
+
+  @Override
+  public RecordWriter<Text, NutchIndexAction> getRecordWriter(
+      FileSystem ignored, JobConf job, String name, Progressable progress)
+      throws IOException {
+
+    final IndexWriters writers = new IndexWriters(job);
+
+    writers.open(job, name);
+
+    return new RecordWriter<Text, NutchIndexAction>() {
+
+      public void close(Reporter reporter) throws IOException {
+        writers.close();
+      }
+
+      public void write(Text key, NutchIndexAction indexAction)
+          throws IOException {
+        if (indexAction.action == NutchIndexAction.ADD) {
+          writers.write(indexAction.doc);
+        } else if (indexAction.action == NutchIndexAction.DELETE) {
+          writers.delete(key.toString());
+        }
+      }
+    };
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingException.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingException.java
new file mode 100644
index 0000000..adfefeb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingException.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
/**
 * Checked exception signalling that a document could not be indexed, e.g.
 * thrown by an {@code IndexingFilter} implementation.
 */
@SuppressWarnings("serial")
public class IndexingException extends Exception {

  /** Creates an exception with neither detail message nor cause. */
  public IndexingException() {
    super();
  }

  /** Creates an exception carrying only a detail message. */
  public IndexingException(String message) {
    super(message);
  }

  /** Creates an exception with a detail message and an underlying cause. */
  public IndexingException(String message, Throwable cause) {
    super(message, cause);
  }

  /** Creates an exception wrapping {@code cause} without an own message. */
  public IndexingException(Throwable cause) {
    super(cause);
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFilter.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFilter.java
new file mode 100644
index 0000000..f22a0e5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFilter.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+
+// Nutch imports
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.plugin.Pluggable;
+
/**
 * Extension point for indexing. Permits one to add metadata to the indexed
 * fields. All plugins found which implement this extension point are run
 * sequentially on the parse.
 *
 * <p>Implementations are discovered through the Nutch plugin repository;
 * the chain order is presumably controlled by the
 * {@code indexingfilter.order} property — confirm against
 * {@code IndexingFilters}.</p>
 */
public interface IndexingFilter extends Pluggable, Configurable {
  /** The name of the extension point (the fully-qualified interface name). */
  final static String X_POINT_ID = IndexingFilter.class.getName();

  /**
   * Adds fields or otherwise modifies the document that will be indexed for a
   * parse. Unwanted documents can be removed from indexing by returning a null
   * value.
   * 
   * @param doc
   *          document instance for collecting fields
   * @param parse
   *          parse data instance
   * @param url
   *          page url
   * @param datum
   *          crawl datum for the page (fetch datum from segment containing
   *          fetch status and fetch time)
   * @param inlinks
   *          page inlinks
   * @return modified (or a new) document instance, or null (meaning the
   *         document should be discarded)
   * @throws IndexingException
   *           if the filter cannot process the document
   */
  NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException;
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFilters.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFilters.java
new file mode 100644
index 0000000..334fcad
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFilters.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.parse.Parse;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.hadoop.io.Text;
+
+/** Creates and caches {@link IndexingFilter} implementing plugins. */
+public class IndexingFilters {
+
+  public static final String INDEXINGFILTER_ORDER = "indexingfilter.order";
+
+  public final static Logger LOG = LoggerFactory
+      .getLogger(IndexingFilters.class);
+
+  private IndexingFilter[] indexingFilters;
+
+  public IndexingFilters(Configuration conf) {
+    indexingFilters = (IndexingFilter[]) PluginRepository.get(conf)
+        .getOrderedPlugins(IndexingFilter.class, IndexingFilter.X_POINT_ID,
+            INDEXINGFILTER_ORDER);
+  }
+
+  /** Run all defined filters. */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    for (int i = 0; i < this.indexingFilters.length; i++) {
+      doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
+      // break the loop if an indexing filter discards the doc
+      if (doc == null)
+        return null;
+    }
+
+    return doc;
+  }
+
+}


[37/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/impl/JobWorker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/impl/JobWorker.java b/nutch-core/src/main/java/org/apache/nutch/service/impl/JobWorker.java
new file mode 100644
index 0000000..04821e7
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/impl/JobWorker.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.impl;
+
+import java.text.MessageFormat;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.service.model.request.JobConfig;
+import org.apache.nutch.service.model.response.JobInfo;
+import org.apache.nutch.service.model.response.JobInfo.State;
+import org.apache.nutch.service.resources.ConfigResource;
+import org.apache.nutch.util.NutchTool;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class JobWorker implements Runnable{
+
+  private JobInfo jobInfo;
+  private JobConfig jobConfig;
+  private static final Logger LOG = LoggerFactory.getLogger(JobWorker.class);
+  private NutchTool tool;
+
+  /**
+   * To initialize JobWorker thread with the Job Configurations provided by user.
+   * @param jobConfig
+   * @param conf
+   * @param tool - NutchTool to run 
+   */
+  public JobWorker(JobConfig jobConfig, Configuration conf, NutchTool tool) {
+    this.jobConfig = jobConfig;
+    this.tool = tool;
+    if (jobConfig.getConfId() == null) {
+      jobConfig.setConfId(ConfigResource.DEFAULT);
+    }
+
+    jobInfo = new JobInfo(generateId(), jobConfig, State.IDLE, "idle");
+    if (jobConfig.getCrawlId() != null) {
+      conf.set(Nutch.CRAWL_ID_KEY, jobConfig.getCrawlId());
+    }
+  }
+
+  private String generateId() {
+    if (jobConfig.getCrawlId() == null) {
+      return MessageFormat.format("{0}-{1}-{2}", jobConfig.getConfId(),
+          jobConfig.getType(), String.valueOf(hashCode()));
+    }
+    return MessageFormat.format("{0}-{1}-{2}-{3}", jobConfig.getCrawlId(),
+        jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode()));
+  }
+
+  @Override
+  public void run() {
+    try {
+      getInfo().setState(State.RUNNING);
+      getInfo().setMsg("OK");
+      getInfo().setResult(tool.run(getInfo().getArgs(), getInfo().getCrawlId()));
+      getInfo().setState(State.FINISHED);
+    } catch (Exception e) {
+      LOG.error("Cannot run job worker!", e);
+      getInfo().setMsg("ERROR: " + e.toString());
+      getInfo().setState(State.FAILED);
+    }
+  }
+
+  public JobInfo getInfo() {
+    return jobInfo;
+  }
+
+  /**
+   * To stop the executing job
+   * @return boolean true/false
+   */
+  public boolean stopJob() {
+    getInfo().setState(State.STOPPING);
+    try {
+      return tool.stopJob();
+    } catch (Exception e) {
+      throw new RuntimeException(
+          "Cannot stop job with id " + getInfo().getId(), e);
+    }
+  }
+
+  public boolean killJob() {
+    getInfo().setState(State.KILLING);
+    try {
+      boolean result = tool.killJob();
+      getInfo().setState(State.KILLED);
+      return result;
+    } catch (Exception e) {
+      throw new RuntimeException(
+          "Cannot kill job with id " + getInfo().getId(), e);
+    }
+  }
+
+  public void setInfo(JobInfo jobInfo) {
+    this.jobInfo = jobInfo;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/impl/LinkReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/impl/LinkReader.java b/nutch-core/src/main/java/org/apache/nutch/service/impl/LinkReader.java
new file mode 100644
index 0000000..cc88501
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/impl/LinkReader.java
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.impl;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.ws.rs.WebApplicationException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.SequenceFile.Reader;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.scoring.webgraph.LinkDatum;
+import org.apache.nutch.service.NutchReader;
+
+public class LinkReader implements NutchReader{
+
+  @Override
+  public List read(String path) throws FileNotFoundException {
+    List<HashMap> rows=new ArrayList<HashMap>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try{
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = (Writable)
+          ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      LinkDatum value = new LinkDatum();
+
+      while(reader.next(key, value)) {
+        try {
+          HashMap<String, String> t_row = getLinksRow(key,value);
+          rows.add(t_row);
+        }
+        catch (Exception e) {
+        }
+      }
+      reader.close();
+
+    }catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+
+    return rows;
+  }
+
+  @Override
+  public List head(String path, int nrows) throws FileNotFoundException {
+    List<HashMap> rows=new ArrayList<HashMap>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try{
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = (Writable)
+          ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      LinkDatum value = new LinkDatum();
+      int i = 0;
+      while(reader.next(key, value) && i<nrows) {
+
+        HashMap<String, String> t_row = getLinksRow(key,value);
+        rows.add(t_row);
+
+        i++;
+      }
+      reader.close();
+
+    }catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+
+    return rows;
+  }
+
+  @Override
+  public List slice(String path, int start, int end)
+      throws FileNotFoundException {
+    List<HashMap> rows=new ArrayList<HashMap>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try{
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = (Writable)
+          ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      LinkDatum value = new LinkDatum();
+      int i = 0;
+
+      for(;i<start && reader.next(key, value);i++){} // increment to read start position
+      while(reader.next(key, value) && i<end) {
+        HashMap<String, String> t_row = getLinksRow(key,value);
+        rows.add(t_row);
+
+        i++;
+      }
+      reader.close();
+
+    }catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+
+    return rows;
+  }
+
+  @Override
+  public int count(String path) throws FileNotFoundException {
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    int i = 0;
+    try {
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = (Writable)ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Writable value = (Writable)ReflectionUtils.newInstance(reader.getValueClass(), conf);
+
+      while(reader.next(key, value)) {
+        i++;
+      }
+      reader.close();
+    } catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+    return i;
+  }
+
+  private HashMap<String, String> getLinksRow(Writable key, LinkDatum value) {
+    HashMap<String, String> t_row = new HashMap<String, String>();
+    t_row.put("key_url", key.toString());
+    t_row.put("url", value.getUrl());
+    t_row.put("anchor", value.getAnchor());
+    t_row.put("score", String.valueOf(value.getScore()));
+    t_row.put("timestamp", String.valueOf(value.getTimestamp()));
+    t_row.put("linktype", String.valueOf(value.getLinkType()));
+
+    return t_row;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/impl/NodeReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/impl/NodeReader.java b/nutch-core/src/main/java/org/apache/nutch/service/impl/NodeReader.java
new file mode 100644
index 0000000..2155a16
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/impl/NodeReader.java
@@ -0,0 +1,184 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.impl;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import javax.ws.rs.WebApplicationException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.io.SequenceFile.Reader;
+import org.apache.nutch.scoring.webgraph.Node;
+import org.apache.nutch.service.NutchReader;
+
+public class NodeReader implements NutchReader {
+
+  @Override
+  public List read(String path) throws FileNotFoundException {
+    // TODO Auto-generated method stub
+    List<HashMap> rows=new ArrayList<HashMap>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try{
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = (Writable)
+          ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Node value = new Node();
+
+      while(reader.next(key, value)) {
+        try {
+          HashMap<String, String> t_row = getNodeRow(key,value);
+          rows.add(t_row);
+        }
+        catch (Exception e) {
+        }
+      }
+      reader.close();
+
+    }catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+
+    return rows;
+
+  }
+
+  @Override
+  public List head(String path, int nrows) throws FileNotFoundException {
+    List<HashMap> rows=new ArrayList<HashMap>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try{
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = (Writable)
+          ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Node value = new Node();
+      int i = 0;
+      while(reader.next(key, value) && i<nrows) {
+        HashMap<String, String> t_row = getNodeRow(key,value);
+        rows.add(t_row);
+
+        i++;
+      }
+      reader.close();
+
+    }catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+      LOG.error("Error occurred while reading file {} : ", file, 
+          StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+
+    return rows;
+  }
+
+  @Override
+  public List slice(String path, int start, int end)
+      throws FileNotFoundException {
+    List<HashMap> rows=new ArrayList<HashMap>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try{
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = (Writable)
+          ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Node value = new Node();
+      int i = 0;
+
+      for(;i<start && reader.next(key, value);i++){} // increment to read start position
+      while(reader.next(key, value) && i<end) {
+        HashMap<String, String> t_row = getNodeRow(key,value);
+        rows.add(t_row);
+
+        i++;
+      }
+      reader.close();
+
+    }catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+      LOG.error("Error occurred while reading file {} : ", file, 
+          StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+
+    return rows;
+  }
+
+  @Override
+  public int count(String path) throws FileNotFoundException {
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    int i =0;
+    try{
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = (Writable)
+          ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Node value = new Node();
+
+      while(reader.next(key, value)) {
+        i++;
+      }
+      reader.close();
+
+    }catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+      LOG.error("Error occurred while reading file {} : ", file, 
+          StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+
+    return i;
+  }
+
+  private HashMap<String, String> getNodeRow(Writable key, Node value) {
+    HashMap<String, String> t_row = new HashMap<String, String>();
+    t_row.put("key_url", key.toString());
+    t_row.put("num_inlinks", String.valueOf(value.getNumInlinks()) );
+    t_row.put("num_outlinks", String.valueOf(value.getNumOutlinks()) );
+    t_row.put("inlink_score", String.valueOf(value.getInlinkScore()));
+    t_row.put("outlink_score", String.valueOf(value.getOutlinkScore()));
+    t_row.put("metadata", value.getMetadata().toString());
+
+    return t_row;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java b/nutch-core/src/main/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
new file mode 100644
index 0000000..3fc5ba3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
@@ -0,0 +1,131 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.impl;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Queue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.nutch.service.model.response.JobInfo;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Queues;
+
+
+
+public class NutchServerPoolExecutor extends ThreadPoolExecutor{
+
+  private Queue<JobWorker> workersHistory;
+  private Queue<JobWorker> runningWorkers;
+
+  public NutchServerPoolExecutor(int corePoolSize, int maxPoolSize, long keepAliveTime, TimeUnit unit, BlockingQueue<Runnable> workQueue){
+    super(corePoolSize, maxPoolSize, keepAliveTime, unit, workQueue);
+    workersHistory = Queues.newArrayBlockingQueue(maxPoolSize);
+    runningWorkers = Queues.newArrayBlockingQueue(maxPoolSize);
+  }
+
+  @Override
+  protected void beforeExecute(Thread thread, Runnable runnable) {
+    super.beforeExecute(thread, runnable);
+    synchronized (runningWorkers) {
+      runningWorkers.offer(((JobWorker) runnable));
+    }
+  }
+  @Override
+  protected void afterExecute(Runnable runnable, Throwable throwable) {
+    super.afterExecute(runnable, throwable);
+    synchronized (runningWorkers) {
+      runningWorkers.remove(((JobWorker) runnable).getInfo());
+    }
+    JobWorker worker = ((JobWorker) runnable);
+    addStatusToHistory(worker);
+  }
+
+  private void addStatusToHistory(JobWorker worker) {
+    synchronized (workersHistory) {
+      if (!workersHistory.offer(worker)) {
+        workersHistory.poll();
+        workersHistory.add(worker);
+      }
+    }
+  }
+
+  /**
+   * Find the Job Worker Thread
+   * @param jobId
+   * @return
+   */
+  public JobWorker findWorker(String jobId) {
+    synchronized (runningWorkers) {
+      for (JobWorker worker : runningWorkers) {
+        if (StringUtils.equals(worker.getInfo().getId(), jobId)) {
+          return worker;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Gives the Job history
+   * @return
+   */
+  public Collection<JobInfo> getJobHistory() {
+    return getJobsInfo(workersHistory);
+  }
+
+  /**
+   * Gives the list of currently running jobs
+   * @return
+   */
+  public Collection<JobInfo> getJobRunning() {
+    return getJobsInfo(runningWorkers);
+  }
+
+  /**
+   * Gives all jobs(currently running and completed)
+   * @return
+   */
+  @SuppressWarnings("unchecked")
+  public Collection<JobInfo> getAllJobs() {
+    return CollectionUtils.union(getJobRunning(), getJobHistory());
+  }
+
+  private Collection<JobInfo> getJobsInfo(Collection<JobWorker> workers) {
+    List<JobInfo> jobsInfo = Lists.newLinkedList();
+    for (JobWorker worker : workers) {
+      jobsInfo.add(worker.getInfo());
+    }
+    return jobsInfo;
+  }
+
+
+  public JobInfo getInfo(String jobId) {
+    for (JobInfo jobInfo : getAllJobs()) {
+      if (StringUtils.equals(jobId, jobInfo.getId())) {
+        return jobInfo;
+      }
+    }
+    return null;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/impl/SequenceReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/impl/SequenceReader.java b/nutch-core/src/main/java/org/apache/nutch/service/impl/SequenceReader.java
new file mode 100644
index 0000000..ce5d120
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/impl/SequenceReader.java
@@ -0,0 +1,171 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.impl;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.ws.rs.WebApplicationException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.service.NutchReader;
+
+/**
+ * Enables reading a sequence file and methods provide different 
+ * ways to read the file. 
+ * @author Sujen Shah
+ *
+ */
+public class SequenceReader implements NutchReader {
+
+  @Override
+  public List<List<String>> read(String path) throws FileNotFoundException {
+    // TODO Auto-generated method stub
+    List<List<String>> rows=new ArrayList<List<String>>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try {
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = 
+          (Writable)ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Writable value = 
+          (Writable)ReflectionUtils.newInstance(reader.getValueClass(), conf);
+      
+      while(reader.next(key, value)) {
+        List<String> row =new ArrayList<String>();
+        row.add(key.toString());
+        row.add(value.toString());
+        rows.add(row);
+      }
+      reader.close();
+    }catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      e.printStackTrace();
+      LOG.error("Error occurred while reading file {} : ", file, 
+          StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+    return rows;
+  }
+
+  @Override
+  public List<List<String>> head(String path, int nrows) 
+      throws FileNotFoundException {
+    // TODO Auto-generated method stub
+    
+    List<List<String>> rows=new ArrayList<List<String>>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try {
+      
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = 
+          (Writable)ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Writable value = 
+          (Writable)ReflectionUtils.newInstance(reader.getValueClass(), conf);
+      int i = 0;
+      while(reader.next(key, value) && i<nrows) {
+        List<String> row =new ArrayList<String>();
+        row.add(key.toString());
+        row.add(value.toString());
+        rows.add(row);
+        i++;
+      }
+      reader.close();
+    } catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      LOG.error("Error occurred while reading file {} : ", file, 
+          StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+    return rows;
+  }
+
+  @Override
+  public List<List<String>> slice(String path, int start, int end) 
+      throws FileNotFoundException {
+    List<List<String>> rows=new ArrayList<List<String>>();
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    try {
+      
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = 
+          (Writable)ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Writable value = 
+          (Writable)ReflectionUtils.newInstance(reader.getValueClass(), conf);
+      int i = 0;
+      
+      for(;i<start && reader.next(key, value);i++){} // increment to read start position
+      while(reader.next(key, value) && i<end) {
+        List<String> row =new ArrayList<String>();
+        row.add(key.toString());
+        row.add(value.toString());
+        rows.add(row);
+        i++;
+      }
+      reader.close();
+    } catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      LOG.error("Error occurred while reading file {} : ", file, 
+          StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+    return rows;
+  }
+
+  @Override
+  public int count(String path) throws FileNotFoundException {
+    Path file = new Path(path);
+    SequenceFile.Reader reader;
+    int i = 0;
+    try {
+      reader = new SequenceFile.Reader(conf, Reader.file(file));
+      Writable key = 
+          (Writable)ReflectionUtils.newInstance(reader.getKeyClass(), conf);
+      Writable value = 
+          (Writable)ReflectionUtils.newInstance(reader.getValueClass(), conf);
+     
+      while(reader.next(key, value)) {
+        i++;
+      }
+      reader.close();
+    } catch(FileNotFoundException fne){ 
+      throw new FileNotFoundException();
+    }catch (IOException e) {
+      // TODO Auto-generated catch block
+      LOG.error("Error occurred while reading file {} : ", file, 
+          StringUtils.stringifyException(e));
+      throw new WebApplicationException();
+    } 
+    return i;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/request/DbQuery.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/request/DbQuery.java b/nutch-core/src/main/java/org/apache/nutch/service/model/request/DbQuery.java
new file mode 100644
index 0000000..5d069dc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/request/DbQuery.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.model.request;
+
+import java.util.HashMap;
+import java.util.Map;
+
/**
 * Request model describing a query against a Nutch database: which
 * configuration and crawl to use, the database type, and free-form
 * query arguments.
 */
public class DbQuery {

  /** Identifier of the Nutch configuration to run the query with. */
  private String confId;

  /** Type of the database being queried. */
  private String type;

  /** Identifier of the crawl whose data is queried. */
  private String crawlId;

  /** Additional query arguments keyed by name; never null, empty by default. */
  private Map<String, String> args = new HashMap<String, String>();

  public String getConfId() {
    return confId;
  }

  public void setConfId(String confId) {
    this.confId = confId;
  }

  public String getType() {
    return type;
  }

  public void setType(String type) {
    this.type = type;
  }

  public String getCrawlId() {
    return crawlId;
  }

  public void setCrawlId(String crawlId) {
    this.crawlId = crawlId;
  }

  public Map<String, String> getArgs() {
    return args;
  }

  public void setArgs(Map<String, String> args) {
    this.args = args;
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/request/JobConfig.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/request/JobConfig.java b/nutch-core/src/main/java/org/apache/nutch/service/model/request/JobConfig.java
new file mode 100644
index 0000000..af6c945
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/request/JobConfig.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service.model.request;
+
+import java.util.Map;
+
+import org.apache.nutch.service.JobManager.JobType;
+
+
+public class JobConfig {
+  private String crawlId;
+  private JobType type;
+  private String confId;
+  private String jobClassName;
+  private Map<String, Object> args;
+
+  public String getCrawlId() {
+    return crawlId;
+  }
+
+  public void setCrawlId(String crawlId) {
+    this.crawlId = crawlId;
+  }
+
+  public JobType getType() {
+    return type;
+  }
+
+  public void setType(JobType type) {
+    this.type = type;
+  }
+
+  public String getConfId() {
+    return confId;
+  }
+
+  public void setConfId(String confId) {
+    this.confId = confId;
+  }
+
+  public Map<String, Object> getArgs() {
+    return args;
+  }
+
+  public void setArgs(Map<String, Object> args) {
+    this.args = args;
+  }
+
+  public String getJobClassName() {
+    return jobClassName;
+  }
+
+  public void setJobClassName(String jobClass) {
+    this.jobClassName = jobClass;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/request/NutchConfig.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/request/NutchConfig.java b/nutch-core/src/main/java/org/apache/nutch/service/model/request/NutchConfig.java
new file mode 100644
index 0000000..ffa9e3e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/request/NutchConfig.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.model.request;
+
+import java.util.Map;
+
+import java.util.Collections;
+
/**
 * Request model for creating a named Nutch configuration: an identifier,
 * key/value parameters and a force flag.
 */
public class NutchConfig {

  /** Identifier under which the configuration is stored. */
  private String configId;

  // Presumably allows replacing an existing configuration with the same id
  // when true — confirm against the configuration manager's create logic.
  private boolean force = false;

  /** Configuration properties as key/value pairs; empty (immutable) by default. */
  private Map<String, String> params = Collections.emptyMap();

  public String getConfigId() {
    return configId;
  }

  public void setConfigId(String configId) {
    this.configId = configId;
  }

  public boolean isForce() {
    return force;
  }

  public void setForce(boolean force) {
    this.force = force;
  }

  public Map<String, String> getParams() {
    return params;
  }

  public void setParams(Map<String, String> params) {
    this.params = params;
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/request/ReaderConfig.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/request/ReaderConfig.java b/nutch-core/src/main/java/org/apache/nutch/service/model/request/ReaderConfig.java
new file mode 100644
index 0000000..81d7440
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/request/ReaderConfig.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.model.request;
+
/**
 * Request model holding the path of the file a {@code NutchReader}
 * should read.
 */
public class ReaderConfig {

  /** Filesystem path of the file to read. */
  private String path;

  /** @return the configured file path, or null if unset */
  public String getPath() {
    return path;
  }

  /** @param path the file path to read */
  public void setPath(String path) {
    this.path = path;
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/request/SeedList.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/request/SeedList.java b/nutch-core/src/main/java/org/apache/nutch/service/model/request/SeedList.java
new file mode 100644
index 0000000..bbb3e2a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/request/SeedList.java
@@ -0,0 +1,93 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.service.model.request;
+
+import java.io.Serializable;
+import java.util.Collection;
+
+import org.apache.commons.collections4.CollectionUtils;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonManagedReference;
+
+/**
+ * Request model describing a named list of seed URLs used to start a crawl.
+ * Equality and hash code are based solely on {@code id}; two instances with
+ * {@code null} ids compare equal.
+ */
+// NOTE(review): implements Serializable without declaring serialVersionUID —
+// confirm whether a stable serialized form is required.
+public class SeedList implements Serializable {
+
+  private Long id;
+
+  private String name;
+
+  // Forward side of the parent/child JSON relation; pairs with the
+  // @JsonBackReference on SeedUrl.seedList to avoid infinite recursion
+  // during Jackson serialization.
+  @JsonManagedReference
+  private Collection<SeedUrl> seedUrls;
+
+  public Long getId() {
+    return id;
+  }
+
+  public void setId(Long id) {
+    this.id = id;
+  }
+
+  public Collection<SeedUrl> getSeedUrls() {
+    return seedUrls;
+  }
+
+  public void setSeedUrls(Collection<SeedUrl> seedUrls) {
+    this.seedUrls = seedUrls;
+  }
+
+  public String getName() {
+    return name;
+  }
+
+  public void setName(String name) {
+    this.name = name;
+  }
+
+  /**
+   * Convenience count of the seed URLs; returns 0 when the collection is
+   * null or empty. Excluded from JSON output via @JsonIgnore.
+   */
+  @JsonIgnore
+  public int getSeedUrlsCount() {
+    if (CollectionUtils.isEmpty(seedUrls)) {
+      return 0;
+    }
+    return seedUrls.size();
+  }
+
+  // id-based hash code, consistent with equals() below.
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((id == null) ? 0 : id.hashCode());
+    return result;
+  }
+
+  // Identity is defined by the id field only; name and seedUrls are ignored.
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    SeedList other = (SeedList) obj;
+    if (id == null) {
+      if (other.id != null)
+        return false;
+    } else if (!id.equals(other.id))
+      return false;
+    return true;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/request/SeedUrl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/request/SeedUrl.java b/nutch-core/src/main/java/org/apache/nutch/service/model/request/SeedUrl.java
new file mode 100644
index 0000000..b1c93a8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/request/SeedUrl.java
@@ -0,0 +1,89 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.service.model.request;
+
+import java.io.Serializable;
+
+import com.fasterxml.jackson.annotation.JsonBackReference;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+
+/**
+ * Request model for a single seed URL belonging to a {@link SeedList}.
+ * Equality and hash code are based solely on {@code id}; two instances with
+ * {@code null} ids compare equal.
+ */
+// NOTE(review): implements Serializable without declaring serialVersionUID —
+// confirm whether a stable serialized form is required.
+public class SeedUrl implements Serializable {
+
+  private Long id;
+
+  // Back side of the parent/child JSON relation; pairs with the
+  // @JsonManagedReference on SeedList.seedUrls to avoid infinite recursion.
+  @JsonBackReference
+  private SeedList seedList;
+
+  private String url;
+
+  // No-arg constructor required for JSON deserialization.
+  public SeedUrl() {}
+
+  public SeedUrl(String url) {
+    this.url = url;
+  }
+  
+  public Long getId() {
+    return id;
+  }
+
+  public void setId(Long id) {
+    this.id = id;
+  }
+
+  public String getUrl() {
+    return url;
+  }
+
+  public void setUrl(String url) {
+    this.url = url;
+  }
+
+  @JsonIgnore
+  public SeedList getSeedList() {
+    return seedList;
+  }
+
+  @JsonIgnore
+  public void setSeedList(SeedList seedList) {
+    this.seedList = seedList;
+  }
+
+  // id-based hash code, consistent with equals() below.
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + ((id == null) ? 0 : id.hashCode());
+    return result;
+  }
+
+  // Identity is defined by the id field only; url and seedList are ignored.
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    SeedUrl other = (SeedUrl) obj;
+    if (id == null) {
+      if (other.id != null)
+        return false;
+    } else if (!id.equals(other.id))
+      return false;
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java b/nutch-core/src/main/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
new file mode 100644
index 0000000..267b50b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.model.response;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.nutch.parse.Outlink;
+
+/**
+ * Response model describing one fetched URL from the in-memory fetch node DB:
+ * the URL itself, its fetch status code, the number of outlinks, and a list
+ * of child nodes built from those outlinks.
+ */
+public class FetchNodeDbInfo {
+  
+  private String url;
+  private int status;
+  private int numOfOutlinks;
+  private List<ChildNode> children = new ArrayList<ChildNode>();
+  
+  
+  public String getUrl() {
+    return url;
+  }
+
+
+  public void setUrl(String url) {
+    this.url = url;
+  }
+
+
+  public int getStatus() {
+    return status;
+  }
+
+
+  public void setStatus(int status) {
+    this.status = status;
+  }
+
+
+  public int getNumOfOutlinks() {
+    return numOfOutlinks;
+  }
+
+
+  public void setNumOfOutlinks(int numOfOutlinks) {
+    this.numOfOutlinks = numOfOutlinks;
+  }
+  
+  /**
+   * Converts each outlink into a ChildNode (target URL + anchor text).
+   * NOTE(review): this appends to the existing children list rather than
+   * replacing it — repeated calls accumulate entries; confirm that callers
+   * invoke it at most once per instance.
+   */
+  public void setChildNodes(Outlink[] links){
+    ChildNode childNode;
+    for(Outlink outlink: links){
+      childNode = new ChildNode(outlink.getToUrl(), outlink.getAnchor());
+      children.add(childNode);
+    }
+  }
+
+
+  // NOTE(review): private non-static inner class exposed only through the
+  // getChildren() accessor; presumably Jackson serializes it via its public
+  // getters — confirm the serializer can access a private inner type.
+  private class ChildNode{
+    private String childUrl;   // target URL of the outlink
+    private String anchorText; // anchor text associated with the outlink
+    
+    public ChildNode(String childUrl, String anchorText){
+      this.childUrl = childUrl;
+      this.anchorText = anchorText;
+    }
+    
+    public String getAnchorText() {
+      return anchorText;
+    }
+    public void setAnchorText(String anchorText) {
+      this.anchorText = anchorText;
+    }
+    public String getChildUrl() {
+      return childUrl;
+    }
+    public void setChildUrl(String childUrl) {
+      this.childUrl = childUrl;
+    }
+  }
+
+
+  public List<ChildNode> getChildren() {
+    return children;
+  }
+
+
+  public void setChildren(List<ChildNode> children) {
+    this.children = children;
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/response/JobInfo.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/response/JobInfo.java b/nutch-core/src/main/java/org/apache/nutch/service/model/response/JobInfo.java
new file mode 100644
index 0000000..c2e185d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/response/JobInfo.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.model.response;
+
+import java.util.Map;
+
+import org.apache.nutch.service.JobManager.JobType;
+import org.apache.nutch.service.model.request.JobConfig;
+
+/**
+ * Response model describing a single server-side job: its identity, type,
+ * configuration id, crawl id, input arguments, result map, current lifecycle
+ * {@link State}, and a free-form status message.
+ */
+public class JobInfo {
+
+  // Job lifecycle states; ANY acts as a wildcard when filtering job lists.
+  public static enum State {
+    IDLE, RUNNING, FINISHED, FAILED, KILLED, STOPPING, KILLING, ANY
+  };
+
+  private String id;
+  private JobType type;
+  private String confId;
+  private Map<String, Object> args;
+  private Map<String, Object> result;
+  private State state;
+  private String msg;
+  private String crawlId;
+
+  /**
+   * Builds a JobInfo snapshot from a submitted job configuration.
+   *
+   * @param generateId server-generated identifier for the job
+   * @param jobConfig  the originating request (type, confId, crawlId, args)
+   * @param state      initial lifecycle state
+   * @param msg        human-readable status message
+   */
+  public JobInfo(String generateId, JobConfig jobConfig, State state,
+      String msg) {
+    this.id = generateId;
+    this.type = jobConfig.getType();
+    this.confId = jobConfig.getConfId();
+    this.crawlId = jobConfig.getCrawlId();
+    this.args = jobConfig.getArgs();
+    this.msg = msg;
+    this.state = state;
+  }
+  public String getId() {
+    return id;
+  }
+  public void setId(String id) {
+    this.id = id;
+  }
+  public JobType getType() {
+    return type;
+  }
+  public void setType(JobType type) {
+    this.type = type;
+  }
+  public String getConfId() {
+    return confId;
+  }
+  public void setConfId(String confId) {
+    this.confId = confId;
+  }
+  public Map<String, Object> getArgs() {
+    return args;
+  }
+  public void setArgs(Map<String, Object> args) {
+    this.args = args;
+  }
+  public Map<String, Object> getResult() {
+    return result;
+  }
+  public void setResult(Map<String, Object> result) {
+    this.result = result;
+  }	
+  public State getState() {
+    return state;
+  }
+  public void setState(State state) {
+    this.state = state;
+  }
+  public String getMsg() {
+    return msg;
+  }
+  public void setMsg(String msg) {
+    this.msg = msg;
+  }
+  public String getCrawlId() {
+    return crawlId;
+  }
+  public void setCrawlId(String crawlId) {
+    this.crawlId = crawlId;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/model/response/NutchServerInfo.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/model/response/NutchServerInfo.java b/nutch-core/src/main/java/org/apache/nutch/service/model/response/NutchServerInfo.java
new file mode 100644
index 0000000..f8867e6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/model/response/NutchServerInfo.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.model.response;
+
+import java.util.Collection;
+import java.util.Date;
+import java.util.Set;
+
+/**
+ * Response model summarizing the state of the Nutch server: its start date,
+ * the names of known configurations, and the full and currently-running job
+ * collections.
+ */
+public class NutchServerInfo {
+
+  private Date startDate;              // when the server process started
+  private Set<String> configuration;   // ids of all known configurations
+  private Collection<JobInfo> jobs;        // all jobs, any state
+  private Collection<JobInfo> runningJobs; // subset in RUNNING state
+  public Date getStartDate() {
+    return startDate;
+  }
+  public void setStartDate(Date startDate) {
+    this.startDate = startDate;
+  }
+  public Set<String> getConfiguration() {
+    return configuration;
+  }
+  public void setConfiguration(Set<String> configuration) {
+    this.configuration = configuration;
+  }
+  public Collection<JobInfo> getJobs() {
+    return jobs;
+  }
+  public void setJobs(Collection<JobInfo> jobs) {
+    this.jobs = jobs;
+  }
+  public Collection<JobInfo> getRunningJobs() {
+    return runningJobs;
+  }
+  public void setRunningJobs(Collection<JobInfo> runningJobs) {
+    this.runningJobs = runningJobs;
+  }
+  
+  
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/resources/AbstractResource.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/resources/AbstractResource.java b/nutch-core/src/main/java/org/apache/nutch/service/resources/AbstractResource.java
new file mode 100644
index 0000000..ebe4138
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/resources/AbstractResource.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.resources;
+
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.Response.Status;
+
+import org.apache.nutch.service.ConfManager;
+import org.apache.nutch.service.JobManager;
+import org.apache.nutch.service.NutchServer;
+
+/**
+ * Base class for all REST resources. Provides the shared singleton handles
+ * (server, configuration manager, job manager) obtained from NutchServer,
+ * and defaults every subclass to JSON responses.
+ */
+@Produces(MediaType.APPLICATION_JSON)
+public abstract class AbstractResource {
+
+  protected JobManager jobManager;
+  protected ConfManager configManager;
+  protected NutchServer server;
+
+  public AbstractResource() {
+    server = NutchServer.getInstance();
+    configManager = NutchServer.getInstance().getConfManager();
+    jobManager = NutchServer.getInstance().getJobManager();
+  }
+
+  /** Aborts request processing with an HTTP 400 carrying the given message. */
+  protected void throwBadRequestException(String message) {
+    throw new WebApplicationException(Response.status(Status.BAD_REQUEST).entity(message).build());
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/resources/AdminResource.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/resources/AdminResource.java b/nutch-core/src/main/java/org/apache/nutch/service/resources/AdminResource.java
new file mode 100644
index 0000000..3f0189e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/resources/AdminResource.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.resources;
+
+import java.util.Date;
+
+import javax.ws.rs.GET;
+import javax.ws.rs.Path;
+import javax.ws.rs.QueryParam;
+
+import org.apache.nutch.service.model.response.JobInfo.State;
+import org.apache.nutch.service.model.response.NutchServerInfo;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Administrative endpoints: server status reporting and graceful shutdown. */
+@Path(value="/admin")
+public class AdminResource extends AbstractResource{
+
+  // Delay (seconds) between acknowledging a stop request and shutting down,
+  // so the HTTP response can be delivered before the server exits.
+  private final int DELAY_SEC = 1;
+  private static final Logger LOG = LoggerFactory
+      .getLogger(AdminResource.class);
+
+  /**
+   * To get the status of the Nutch Server 
+   * @return server metadata: start date, known configurations, all jobs,
+   *         and the subset of currently running jobs
+   */
+  @GET
+  @Path(value="/")
+  public NutchServerInfo getServerStatus(){
+    NutchServerInfo serverInfo = new NutchServerInfo();
+    serverInfo.setConfiguration(configManager.list());
+    serverInfo.setStartDate(new Date(server.getStarted()));
+    serverInfo.setJobs(jobManager.list(null, State.ANY));
+    serverInfo.setRunningJobs(jobManager.list(null, State.RUNNING));    
+    return serverInfo;
+  }
+
+  /**
+   * Stop the Nutch server
+   * @param force If set to true, it will kill any running jobs
+   * @return a plain-text message: either a refusal (jobs still running and
+   *         not forced) or confirmation that shutdown has been scheduled
+   */
+  @GET
+  @Path(value="/stop")
+  public String stopServer(@QueryParam("force") boolean force){
+    if(!server.canStop(force)){
+      return "Jobs still running -- Cannot stop server now" ;
+    }    
+    scheduleServerStop();
+    return "Stopping in server on port " + server.getPort();
+  }
+
+  // Stops the server from a daemon thread after DELAY_SEC seconds, giving the
+  // in-flight HTTP response time to complete.
+  private void scheduleServerStop() {
+    LOG.info("Shutting down server in {} sec", DELAY_SEC);
+    Thread thread = new Thread() {
+      public void run() {
+        try {
+          Thread.sleep(DELAY_SEC*1000);
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+        }
+        server.stop();
+        LOG.info("Service stopped.");
+      }
+    };
+    thread.setDaemon(true);
+    thread.start();
+    LOG.info("Service shutting down...");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/resources/ConfigResource.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/resources/ConfigResource.java b/nutch-core/src/main/java/org/apache/nutch/service/resources/ConfigResource.java
new file mode 100644
index 0000000..6afd621
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/resources/ConfigResource.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.service.resources;
+
+
+import java.util.Map;
+import java.util.Set;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.DELETE;
+import javax.ws.rs.GET;
+import javax.ws.rs.POST;
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.Response.Status;
+
+import org.apache.nutch.service.model.request.NutchConfig;
+import com.fasterxml.jackson.jaxrs.annotation.JacksonFeatures;
+import com.fasterxml.jackson.databind.SerializationFeature;
+
+/**
+ * REST endpoints for managing named Nutch configurations: list, read,
+ * create, delete, and per-property get/update.
+ */
+@Path("/config")
+public class ConfigResource extends AbstractResource{
+
+  // Id of the built-in default configuration, always available.
+  public static final String DEFAULT = "default";
+
+  /**
+   * Returns a list of all configurations created.
+   * @return List of configurations
+   */
+  @GET
+  @Path("/")
+	@JacksonFeatures(serializationEnable =  { SerializationFeature.INDENT_OUTPUT })
+  public Set<String> getConfigs() {
+    return configManager.list();
+  }
+
+  /** 
+   * Get configuration properties 
+   * @param configId The configuration ID to fetch
+   * @return HashMap of the properties set within the given configId
+   */
+  @GET
+  @Path("/{configId}")
+	@JacksonFeatures(serializationEnable =  { SerializationFeature.INDENT_OUTPUT })
+  public Map<String, String> getConfig(@PathParam("configId") String configId) {
+    return configManager.getAsMap(configId);
+  }
+
+  /**
+   * Get property 
+   * @param configId The ID of the configuration
+   * @param propertyId The name(key) of the property
+   * @return value of the specified property in the provided configId.
+   */
+  @GET
+  @Path("/{configId}/{propertyId}")
+  @Produces(MediaType.TEXT_PLAIN)
+	@JacksonFeatures(serializationEnable =  { SerializationFeature.INDENT_OUTPUT })
+  public String getProperty(@PathParam("configId") String configId,
+      @PathParam("propertyId") String propertyId) {
+    return configManager.getAsMap(configId).get(propertyId);
+  }
+
+  /**
+   * Removes the configuration from the list of known configurations. 
+   * @param configId The ID of the configuration to delete
+   */
+  @DELETE
+  @Path("/{configId}")
+  public void deleteConfig(@PathParam("configId") String configId) {
+    configManager.delete(configId);
+  }
+
+  /**
+   * Create new configuration.
+   * @param newConfig the configuration to register; must be non-null
+   * @return The name of the new configuration created
+   */
+  @POST
+  @Path("/create")
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.TEXT_PLAIN)
+  public Response createConfig(NutchConfig newConfig) {
+    if (newConfig == null) {
+      return Response.status(400)
+          .entity("Nutch configuration cannot be empty!").build();
+    }
+    try{
+      configManager.create(newConfig);
+    }catch(Exception e){
+      // Creation failures (e.g. duplicate id) are reported as HTTP 400.
+      return Response.status(400)
+      .entity(e.getMessage()).build();
+    }
+    return Response.ok(newConfig.getConfigId()).build();
+  }
+  
+  /**
+   * Adds/Updates a particular property value in the configuration
+   * @param confId Configuration ID whose property needs to be updated. Make sure that the given
+   *               confId exists to prevent errors. 
+   * @param propertyKey Name of the property
+   * @param value Value as a simple text 
+   * @return Success code
+   */
+  @PUT
+  @Path("/{configId}/{propertyId}")
+  @Consumes(MediaType.TEXT_PLAIN)
+  public Response updateProperty(@PathParam("configId")String confId, 
+      @PathParam("propertyId")String propertyKey, String value) {
+    try{
+    configManager.setProperty(confId, propertyKey, value);
+    }catch(Exception e) {
+      return Response.status(400).entity(e.getMessage()).build();
+    }
+    return Response.ok().build();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/resources/DbResource.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/resources/DbResource.java b/nutch-core/src/main/java/org/apache/nutch/service/resources/DbResource.java
new file mode 100644
index 0000000..2672fcc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/resources/DbResource.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.resources;
+
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.DefaultValue;
+import javax.ws.rs.GET;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.Response.Status;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDbReader;
+import org.apache.nutch.fetcher.FetchNode;
+import org.apache.nutch.fetcher.FetchNodeDb;
+import org.apache.nutch.service.model.request.DbQuery;
+import org.apache.nutch.service.model.response.FetchNodeDbInfo;
+
+/**
+ * REST endpoints for querying the crawl databases: CrawlDb queries
+ * (stats/dump/topN/url) and the in-memory fetch node DB.
+ */
+@Path(value = "/db")
+public class DbResource extends AbstractResource {
+
+  /**
+   * Dispatches a CrawlDb query by its "type" field (stats, dump, topN, url).
+   * Falls back to the default configuration when the requested confId is
+   * unknown.
+   * NOTE(review): an unrecognized type returns null from this method —
+   * confirm the intended HTTP behavior (likely an empty 204/500) and
+   * consider returning a 400 instead.
+   */
+  @POST
+  @Path(value = "/crawldb")
+  @Consumes(MediaType.APPLICATION_JSON)
+  public Response readdb(DbQuery dbQuery){
+    if(dbQuery == null)
+      return Response.status(Status.BAD_REQUEST).build();
+    
+    Configuration conf = configManager.get(dbQuery.getConfId());
+    if(conf == null){
+      conf = configManager.get(ConfigResource.DEFAULT);
+    }
+    if(dbQuery.getCrawlId() == null || dbQuery.getType() == null){
+      return Response.status(Status.BAD_REQUEST).build();
+    }
+    String type = dbQuery.getType();
+
+    if(type.equalsIgnoreCase("stats")){
+      return crawlDbStats(conf, dbQuery.getArgs(), dbQuery.getCrawlId());
+    }
+    if(type.equalsIgnoreCase("dump")){
+      return crawlDbDump(conf, dbQuery.getArgs(), dbQuery.getCrawlId());
+    }
+    if(type.equalsIgnoreCase("topN")){
+      return crawlDbTopN(conf, dbQuery.getArgs(), dbQuery.getCrawlId());
+    }
+    if(type.equalsIgnoreCase("url")){
+      return crawlDbUrl(conf, dbQuery.getArgs(), dbQuery.getCrawlId());
+    }
+    return null;
+
+  }	
+
+  /**
+   * Returns fetch node entries with ids in [from, to] from the in-memory
+   * FetchNodeDb; a "to" of 0 (the default) means "through the end".
+   * Missing ids in the range are skipped.
+   */
+  @GET
+  @Path(value="/fetchdb")
+  public List<FetchNodeDbInfo> fetchDb(@DefaultValue("0")@QueryParam("to")int to, @DefaultValue("0")@QueryParam("from")int from){
+    List<FetchNodeDbInfo> listOfFetchedNodes = new ArrayList<FetchNodeDbInfo>();
+    Map<Integer, FetchNode> fetchNodedbMap = FetchNodeDb.getInstance().getFetchNodeDb();
+
+    if(to ==0 || to>fetchNodedbMap.size()){
+      to = fetchNodedbMap.size();
+    }
+    for(int i=from;i<=to;i++){
+      if(!fetchNodedbMap.containsKey(i)){
+        continue;
+      }
+      FetchNode node = fetchNodedbMap.get(i);
+      FetchNodeDbInfo fdbInfo = new FetchNodeDbInfo();
+      fdbInfo.setUrl(node.getUrl().toString());
+      fdbInfo.setStatus(node.getStatus());
+      fdbInfo.setNumOfOutlinks(node.getOutlinks().length);
+      fdbInfo.setChildNodes(node.getOutlinks());
+      listOfFetchedNodes.add(fdbInfo);
+    }
+
+    return listOfFetchedNodes;
+  }
+  // NOTE(review): printStackTrace in the helpers below should presumably go
+  // through an SLF4J logger like the other resources — confirm and unify.
+  @SuppressWarnings("resource")
+  private Response crawlDbStats(Configuration conf, Map<String, String> args, String crawlId){
+    CrawlDbReader dbr = new CrawlDbReader();
+    try{
+      return Response.ok(dbr.query(args, conf, "stats", crawlId)).build();
+    }catch(Exception e){
+      e.printStackTrace();
+      return Response.serverError().entity(e.getMessage()).type(MediaType.TEXT_PLAIN).build();
+    }
+  }
+
+  // NOTE(review): @Produces has no effect on private non-resource methods in
+  // JAX-RS; the media type is already set explicitly on the Response below.
+  @Produces(MediaType.APPLICATION_OCTET_STREAM)
+  private Response crawlDbDump(Configuration conf, Map<String, String> args, String crawlId){
+    CrawlDbReader dbr = new CrawlDbReader();
+    try{
+      return Response.ok(dbr.query(args, conf, "dump", crawlId), MediaType.APPLICATION_OCTET_STREAM).build();
+    }catch(Exception e){
+      e.printStackTrace();
+      return Response.serverError().entity(e.getMessage()).type(MediaType.TEXT_PLAIN).build();
+    }
+  }
+
+  // NOTE(review): same as above — @Produces on a private helper is inert.
+  @Produces(MediaType.APPLICATION_OCTET_STREAM)
+  private Response crawlDbTopN(Configuration conf, Map<String, String> args, String crawlId) {
+    CrawlDbReader dbr = new CrawlDbReader();
+    try{
+      return Response.ok(dbr.query(args, conf, "topN", crawlId), MediaType.APPLICATION_OCTET_STREAM).build();
+    }catch(Exception e){
+      e.printStackTrace();
+      return Response.serverError().entity(e.getMessage()).type(MediaType.TEXT_PLAIN).build();
+    }		
+  }
+
+  private Response crawlDbUrl(Configuration conf, Map<String, String> args, String crawlId){
+    CrawlDbReader dbr = new CrawlDbReader();
+    try{
+      return Response.ok(dbr.query(args, conf, "url", crawlId)).build();
+    }catch(Exception e){
+      e.printStackTrace();
+      return Response.serverError().entity(e.getMessage()).type(MediaType.TEXT_PLAIN).build();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/resources/JobResource.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/resources/JobResource.java b/nutch-core/src/main/java/org/apache/nutch/service/resources/JobResource.java
new file mode 100644
index 0000000..b142d73
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/resources/JobResource.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.resources;
+
+import java.util.Collection;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.GET;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
+import javax.ws.rs.QueryParam;
+import javax.ws.rs.core.MediaType;
+
+import com.fasterxml.jackson.databind.SerializationFeature;
+import com.fasterxml.jackson.jaxrs.annotation.JacksonFeatures;
+import org.apache.nutch.service.model.request.JobConfig;
+import org.apache.nutch.service.model.response.JobInfo;
+import org.apache.nutch.service.model.response.JobInfo.State;
+
/**
 * REST endpoint for creating and controlling Nutch jobs. All operations
 * are delegated to the {@code jobManager} inherited from
 * {@code AbstractResource}.
 */
@Path(value = "/job")
public class JobResource extends AbstractResource {

  /**
   * Get job history.
   * @param crawlId crawl ID whose jobs are listed; jobs of every state
   *        ({@code State.ANY}) are returned
   * @return A nested JSON object of all the jobs created
   */
  @GET
  @Path(value = "/")
  @JacksonFeatures(serializationEnable =  { SerializationFeature.INDENT_OUTPUT })
  public Collection<JobInfo> getJobs(@QueryParam("crawlId") String crawlId) {
    return jobManager.list(crawlId, State.ANY);
  }

  /**
   * Get job info.
   * @param id Job ID
   * @param crawlId Crawl ID
   * @return A JSON object of job parameters
   */
  @GET
  @Path(value = "/{id}")
  @JacksonFeatures(serializationEnable =  { SerializationFeature.INDENT_OUTPUT })
  public JobInfo getInfo(@PathParam("id") String id,
      @QueryParam("crawlId") String crawlId) {
    return jobManager.get(crawlId, id);
  }

  /**
   * Stop Job.
   * @param id Job ID
   * @param crawlId Crawl ID
   * @return true if the job manager accepted the stop request
   */
  @GET
  @Path(value = "/{id}/stop")
  public boolean stop(@PathParam("id") String id,
      @QueryParam("crawlId") String crawlId) {
    return jobManager.stop(crawlId, id);
  }

  /**
   * Abort a running job.
   * @param id Job ID
   * @param crawlId Crawl ID
   * @return true if the job manager accepted the abort request
   */
  @GET
  @Path(value = "/{id}/abort")
  public boolean abort(@PathParam("id") String id,
      @QueryParam("crawlId") String crawlId) {
    return jobManager.abort(crawlId, id);
  }

  /**
   * Create a new job.
   * @param config The parameters of the job to create; a 400 (bad request)
   *        is raised when no configuration is supplied
   * @return A JSON object of the job created with its details
   */
  @POST
  @Path(value = "/create")
  @Consumes(MediaType.APPLICATION_JSON)
  public JobInfo create(JobConfig config) {
    if (config == null) {
      throwBadRequestException("Job configuration is required!");
    }   
    return jobManager.create(config);   
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/resources/ReaderResouce.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/resources/ReaderResouce.java b/nutch-core/src/main/java/org/apache/nutch/service/resources/ReaderResouce.java
new file mode 100644
index 0000000..030999e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/resources/ReaderResouce.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.resources;
+
+import java.util.HashMap;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.DefaultValue;
+import javax.ws.rs.GET;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.Response.Status;
+
+import org.apache.nutch.service.NutchReader;
+import org.apache.nutch.service.impl.LinkReader;
+import org.apache.nutch.service.impl.NodeReader;
+import org.apache.nutch.service.impl.SequenceReader;
+import org.apache.nutch.service.model.request.ReaderConfig;
+
+/**
+ * The Reader endpoint enables a user to read sequence files, 
+ * nodes and links from the Nutch webgraph.
+ * @author Sujen Shah
+ *
+ */
@Path("/reader")
// NOTE(review): class name is misspelled ("Resouce"); renaming it would change
// the public API surface and the file name, so it is only flagged here.
public class ReaderResouce {

  /**
   * Read a sequence file.
   * @param readerConf carries the path of the sequence file to read
   * @param nrows Number of rows to read. If not specified all rows will be read
   * @param start Specify a starting line number to read the file from
   * @param end The line number to read the file till
   * @param count Boolean value. If true, this endpoint will return the number
   *        of lines in the file instead of its content
   * @return Appropriate HTTP response based on the query
   */
  @Path("/sequence/read")
  @POST
  @Consumes(MediaType.APPLICATION_JSON)
  @Produces(MediaType.APPLICATION_JSON)
  public Response seqRead(ReaderConfig readerConf, 
      @DefaultValue("-1")@QueryParam("nrows") int nrows, 
      @DefaultValue("-1")@QueryParam("start") int start, 
      @QueryParam("end")int end, @QueryParam("count") boolean count) {

    NutchReader reader = new SequenceReader();
    String path = readerConf.getPath();
    return performRead(reader, path, nrows, start, end, count);
  }

  /**
   * Get Link Reader response schema.
   * @return JSON object specifying the schema of the responses returned by the Link Reader
   */
  @Path("/link")
  @GET
  @Produces(MediaType.APPLICATION_JSON)
  public Response linkRead() {
    // Field names and types of the records produced by LinkReader.
    HashMap<String, String> schema = new HashMap<>();
    schema.put("key_url","string");
    schema.put("timestamp", "int");
    schema.put("score","float"); 
    schema.put("anchor","string");
    schema.put("linktype","string");
    schema.put("url","string");
    return Response.ok(schema).type(MediaType.APPLICATION_JSON).build();
  }

  /**
   * Read link objects from the webgraph.
   * @param readerConf carries the path of the link data to read
   * @param nrows number of rows to read; -1 (default) reads all rows
   * @param start starting line number; -1 (default) starts at the beginning
   * @param end line number to read the data till (used with {@code start})
   * @param count if true, return only the number of records
   * @return HTTP response with the requested slice, count or full content
   */
  @Path("/link/read")
  @POST
  @Consumes(MediaType.APPLICATION_JSON)
  @Produces(MediaType.APPLICATION_JSON)
  public Response linkRead(ReaderConfig readerConf, 
      @DefaultValue("-1")@QueryParam("nrows") int nrows, 
      @DefaultValue("-1")@QueryParam("start") int start, 
      @QueryParam("end") int end, @QueryParam("count") boolean count) {

    NutchReader reader = new LinkReader();
    String path = readerConf.getPath();
    return performRead(reader, path, nrows, start, end, count);
  }

  /**
   * Get schema of the Node object.
   * @return JSON object mapping node field names to their types
   */
  @Path("/node")
  @GET
  @Produces(MediaType.APPLICATION_JSON)
  public Response nodeRead() {
    // Field names and types of the records produced by NodeReader.
    HashMap<String, String> schema = new HashMap<>();
    schema.put("key_url","string");
    schema.put("num_inlinks", "int");
    schema.put("num_outlinks","int");
    schema.put("inlink_score","float"); 
    schema.put("outlink_score","float"); 
    schema.put("metadata","string");
    return Response.ok(schema).type(MediaType.APPLICATION_JSON).build();
  }


  /**
   * Read Node object as stored in the Nutch Webgraph.
   * @param readerConf carries the path of the node data to read
   * @param nrows number of rows to read; -1 (default) reads all rows
   * @param start starting line number; -1 (default) starts at the beginning
   * @param end line number to read the data till (used with {@code start})
   * @param count if true, return only the number of records
   * @return HTTP response with the requested slice, count or full content
   */
  @Path("/node/read")
  @POST
  @Consumes(MediaType.APPLICATION_JSON)
  @Produces(MediaType.APPLICATION_JSON)
  public Response nodeRead(ReaderConfig readerConf, 
      @DefaultValue("-1")@QueryParam("nrows") int nrows, 
      @DefaultValue("-1")@QueryParam("start") int start, 
      @QueryParam("end") int end, @QueryParam("count") boolean count) {

    NutchReader reader = new NodeReader();
    String path = readerConf.getPath();
    return performRead(reader, path, nrows, start, end, count);
  }


  // Dispatches to count/slice/head/read on the given reader, in that order
  // of precedence: count wins, then an explicit [start, end) slice, then a
  // head of nrows, otherwise the whole file.
  private Response performRead(NutchReader reader, String path, 
      int nrows, int start, int end, boolean count) {
    Object result;
    try{
      if(count){
        result = reader.count(path);
        return Response.ok(result).type(MediaType.TEXT_PLAIN).build();
      }
      else if(start>-1 && end>0) {
        result = reader.slice(path, start, end);
      }
      else if(nrows>-1) {
        result = reader.head(path, nrows);
      }
      else {
        result = reader.read(path);
      }
      return Response.ok(result).type(MediaType.APPLICATION_JSON).build();
    }catch(Exception e){
      // NOTE(review): every exception — not only a missing file — is reported
      // as 400 "File not found"; the original cause is silently dropped.
      return Response.status(Status.BAD_REQUEST).entity("File not found").build();
    }
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/service/resources/SeedResource.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/service/resources/SeedResource.java b/nutch-core/src/main/java/org/apache/nutch/service/resources/SeedResource.java
new file mode 100644
index 0000000..5261139
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/service/resources/SeedResource.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.service.resources;
+
+import static javax.ws.rs.core.Response.status;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Collection;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.Response.Status;
+
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.nutch.service.model.request.SeedList;
+import org.apache.nutch.service.model.request.SeedUrl;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.io.Files;
+
+@Path("/seed")
+public class SeedResource extends AbstractResource {
+  private static final Logger log = LoggerFactory
+      .getLogger(AdminResource.class);
+
+  /**
+   * Method creates seed list file and returns temporary directory path
+   * @param seedList
+   * @return
+   */
+  @POST
+  @Path("/create")
+  @Consumes(MediaType.APPLICATION_JSON)
+  @Produces(MediaType.TEXT_PLAIN)
+  public Response createSeedFile(SeedList seedList) {
+    if (seedList == null) {
+      return Response.status(Status.BAD_REQUEST)
+          .entity("Seed list cannot be empty!").build();
+    }
+    File seedFile = createSeedFile();
+    BufferedWriter writer = getWriter(seedFile);
+
+    Collection<SeedUrl> seedUrls = seedList.getSeedUrls();
+    if (CollectionUtils.isNotEmpty(seedUrls)) {
+      for (SeedUrl seedUrl : seedUrls) {
+        writeUrl(writer, seedUrl);
+      }
+    }
+
+    return Response.ok().entity(seedFile.getParent()).build();
+  }
+
+  private void writeUrl(BufferedWriter writer, SeedUrl seedUrl) {
+    try {
+      writer.write(seedUrl.getUrl());
+      writer.newLine();
+      writer.flush();
+    } catch (IOException e) {
+      throw handleException(e);
+    }
+  }
+
+  private BufferedWriter getWriter(File seedFile) {
+    try {
+      return new BufferedWriter(new FileWriter(seedFile));
+    } catch (FileNotFoundException e) {
+      throw handleException(e);
+    } catch (IOException e) {
+      throw handleException(e);
+    }
+  }
+
+  private File createSeedFile() {
+    try {
+      return File.createTempFile("seed", ".txt", Files.createTempDir());
+    } catch (IOException e) {
+      throw handleException(e);
+    }
+  }
+
+  private RuntimeException handleException(Exception e) {
+    log.error("Cannot create seed file!", e);
+    return new WebApplicationException(status(Status.INTERNAL_SERVER_ERROR)
+        .entity("Cannot create seed file!").build());
+  }
+
+}


[23/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
new file mode 100644
index 0000000..8f2bee5
--- /dev/null
+++ b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to add document metadata to the index.
+ * Metadata may come from CrawlDb, parse or content metadata.
+ */
+package org.apache.nutch.indexer.metadata;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/build.xml b/nutch-plugins/index-more/build.xml
new file mode 100644
index 0000000..dec1e12
--- /dev/null
+++ b/nutch-plugins/index-more/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-more" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/ivy.xml b/nutch-plugins/index-more/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/index-more/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/plugin.xml b/nutch-plugins/index-more/plugin.xml
new file mode 100644
index 0000000..d920f72
--- /dev/null
+++ b/nutch-plugins/index-more/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-more"
+   name="More Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="index-more.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.more"
+              name="Nutch More Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="MoreIndexingFilter"
+                      class="org.apache.nutch.indexer.more.MoreIndexingFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/pom.xml b/nutch-plugins/index-more/pom.xml
new file mode 100644
index 0000000..80e5de0
--- /dev/null
+++ b/nutch-plugins/index-more/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-more</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-more</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
new file mode 100644
index 0000000..6e64ede
--- /dev/null
+++ b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -0,0 +1,344 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.more;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.nutch.parse.Parse;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.tika.Tika;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import java.text.ParseException;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Date;
+import java.util.regex.*;
+import java.util.HashMap;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang.time.DateUtils;
+
+/**
+ * Add (or reset) a few metaData properties as respective fields (if they are
+ * available), so that they can be accurately used within the search index.
+ * 
+ * 'lastModifed' is indexed to support query by date, 'contentLength' obtains
+ * content length from the HTTP header, 'type' field is indexed to support query
+ * by type and finally the 'title' field is an attempt to reset the title if a
+ * content-disposition hint exists. The logic is that such a presence is
+ * indicative that the content provider wants the filename therein to be used as
+ * the title.
+ * 
+ * Still need to make content-length searchable!
+ * 
+ * @author John Xing
+ */
+
+public class MoreIndexingFilter implements IndexingFilter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(MoreIndexingFilter.class);
+
+  /** Get the MimeTypes resolver instance. */
+  private MimeUtil MIME;
+  private Tika tika = new Tika();
+
+  /** Map for mime-type substitution */
+  private HashMap<String, String> mimeMap = null;
+  private boolean mapMimes = false;
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    String url_s = url.toString();
+
+    addTime(doc, parse.getData(), url_s, datum);
+    addLength(doc, parse.getData(), url_s);
+    addType(doc, parse.getData(), url_s, datum);
+    resetTitle(doc, parse.getData(), url_s);
+
+    return doc;
+  }
+
+  // Add time related meta info. Add last-modified if present. Index date as
+  // last-modified, or, if that's not present, use fetch time.
+  private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
+      CrawlDatum datum) {
+    long time = -1;
+
+    String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
+    if (lastModified != null) { // try parse last-modified
+      time = getTime(lastModified, url); // use as time
+                                         // store as string
+      doc.add("lastModified", new Date(time));
+    }
+
+    if (time == -1) { // if no last-modified specified in HTTP header
+      time = datum.getModifiedTime(); // use value in CrawlDatum
+      if (time <= 0) { // if also unset
+        time = datum.getFetchTime(); // use time the fetch took place (fetchTime
+                                     // of fetchDatum)
+      }
+    }
+
+    // un-stored, indexed and un-tokenized
+    doc.add("date", new Date(time));
+    return doc;
+  }
+
+  private long getTime(String date, String url) {
+    long time = -1;
+    try {
+      time = HttpDateFormat.toLong(date);
+    } catch (ParseException e) {
+      // try to parse it as date in alternative format
+      try {
+        Date parsedDate = DateUtils.parseDate(date, new String[] {
+            "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
+            "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz",
+            "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz",
+            "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
+            "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz",
+            "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd",
+            "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm",
+            "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
+            "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz",
+            "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
+            "yyyy-MM-dd'T'HH:mm:ss'Z'" });
+        time = parsedDate.getTime();
+        // if (LOG.isWarnEnabled()) {
+        // LOG.warn(url + ": parsed date: " + date +" to:"+time);
+        // }
+      } catch (Exception e2) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn(url + ": can't parse erroneous date: " + date);
+        }
+      }
+    }
+    return time;
+  }
+
+  // Add Content-Length
+  private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
+    String contentLength = data.getMeta(Response.CONTENT_LENGTH);
+
+    if (contentLength != null) {
+      // NUTCH-1010 ContentLength not trimmed
+      String trimmed = contentLength.toString().trim();
+      if (!trimmed.isEmpty())
+        doc.add("contentLength", trimmed);
+    }
+    return doc;
+  }
+
+  /**
+   * <p>
+   * Add Content-Type and its primaryType and subType add contentType,
+   * primaryType and subType to field "type" as un-stored, indexed and
+   * un-tokenized, so that search results can be confined by contentType or its
+   * primaryType or its subType.
+   * </p>
+   * <p>
+   * For example, if contentType is application/vnd.ms-powerpoint, search can be
+   * done with one of the following qualifiers
+   * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
+   * all case insensitive. The query filter is implemented in
+   * {@link TypeQueryFilter}.
+   * </p>
+   * 
+   * @param doc
+   * @param data
+   * @param url
+   * @return
+   */
+  private NutchDocument addType(NutchDocument doc, ParseData data, String url,
+      CrawlDatum datum) {
+    String mimeType = null;
+    String contentType = null;
+
+    Writable tcontentType = datum.getMetaData().get(
+        new Text(Response.CONTENT_TYPE));
+    if (tcontentType != null) {
+      contentType = tcontentType.toString();
+    } else
+      contentType = data.getMeta(Response.CONTENT_TYPE);
+    if (contentType == null) {
+      // Note by Jerome Charron on 20050415:
+      // Content Type not solved by a previous plugin
+      // Or unable to solve it... Trying to find it
+      // Should be better to use the doc content too
+      // (using MimeTypes.getMimeType(byte[], String), but I don't know
+      // which field it is?
+      // if (MAGIC) {
+      // contentType = MIME.getMimeType(url, content);
+      // } else {
+      // contentType = MIME.getMimeType(url);
+      // }
+
+      mimeType = tika.detect(url);
+    } else {
+      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
+    }
+
+    // Checks if we solved the content-type.
+    if (mimeType == null) {
+      return doc;
+    }
+
+    // Check if we have to map mime types
+    if (mapMimes) {
+      // Check if the current mime is mapped
+      if (mimeMap.containsKey(mimeType)) {
+        // It's mapped, let's replace it
+        mimeType = mimeMap.get(mimeType);
+      }
+    }
+
+    contentType = mimeType;
+    doc.add("type", contentType);
+
+    // Check if we need to split the content type in sub parts
+    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
+      String[] parts = getParts(contentType);
+
+      for (String part : parts) {
+        doc.add("type", part);
+      }
+    }
+
+    // leave this for future improvement
+    // MimeTypeParameterList parameterList = mimeType.getParameters()
+
+    return doc;
+  }
+
+  /**
+   * Utility method for splitting mime type into type and subtype.
+   * 
+   * @param mimeType
+   * @return
+   */
+  static String[] getParts(String mimeType) {
+    return mimeType.split("/");
+  }
+
+  // Reset title if we see non-standard HTTP header "Content-Disposition".
+  // It's a good indication that content provider wants filename therein
+  // be used as the title of this url.
+
+  // Patterns used to extract filename from possible non-standard
+  // HTTP header "Content-Disposition". Typically it looks like:
+  // Content-Disposition: inline; filename="foo.ppt"
+  private Configuration conf;
+
+  static Pattern patterns[] = { null, null };
+
+  static {
+    try {
+      // order here is important
+      patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]");
+      patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b");
+    } catch (PatternSyntaxException e) {
+      // just ignore
+    }
+  }
+
+  private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
+    String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
+    if (contentDisposition == null || doc.getFieldValue("title") != null)
+      return doc;
+
+    for (int i = 0; i < patterns.length; i++) {
+      Matcher matcher = patterns[i].matcher(contentDisposition);
+      if (matcher.find()) {
+        doc.add("title", matcher.group(1));
+        break;
+      }
+    }
+
+    return doc;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    MIME = new MimeUtil(conf);
+
+    if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false) == true) {
+      mapMimes = true;
+
+      // Load the mapping
+      try {
+        readConfiguration();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  private void readConfiguration() throws IOException {
+    LOG.info("Reading content type mappings from file contenttype-mapping.txt");
+    BufferedReader reader = new BufferedReader(
+        conf.getConfResourceAsReader("contenttype-mapping.txt"));
+    String line;
+    String parts[];
+    boolean formatWarningShown = false;
+
+    mimeMap = new HashMap<String, String>();
+
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        line.trim();
+        parts = line.split("\t");
+
+        // Must be at least two parts
+        if (parts.length > 1) {
+          for (int i = 1; i < parts.length; i++) {
+            mimeMap.put(parts[i].trim(), parts[0].trim());
+          }
+        } else {
+          LOG.warn("Wrong format of line: {}", line);
+          if (!formatWarningShown) {
+            LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]");
+            formatWarningShown = true;
+          }
+        }
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html
new file mode 100644
index 0000000..7b8fade
--- /dev/null
+++ b/nutch-plugins/index-more/src/main/java/org/apache/nutch/indexer/more/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>A more indexing plugin, adds "more" index fields:
+last modified date, MIME type, content length.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
new file mode 100644
index 0000000..f918dde
--- /dev/null
+++ b/nutch-plugins/index-more/src/test/java/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.more;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestMoreIndexingFilter {
+
+  @Test
+  public void testContentType() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    assertContentType(conf, "text/html", "text/html");
+    assertContentType(conf, "text/html; charset=UTF-8", "text/html");
+  }
+
+  @Test
+  public void testGetParts() {
+    String[] parts = MoreIndexingFilter.getParts("text/html");
+    assertParts(parts, 2, "text", "html");
+  }
+
+  /**
+   * @since NUTCH-901
+   */
+  @Test
+  public void testNoParts() {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          new CrawlDatum(), new Inlinks());
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+    Assert.assertNotNull(doc);
+    Assert.assertTrue(doc.getFieldNames().contains("type"));
+    Assert.assertEquals(1, doc.getField("type").getValues().size());
+    Assert.assertEquals("text/html", doc.getFieldValue("type"));
+  }
+
+  @Test
+  public void testContentDispositionTitle() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+
+    Metadata metadata = new Metadata();
+    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+
+    Text url = new Text("http://www.example.com/");
+    ParseImpl parseImpl = new ParseImpl("text", new ParseData(
+        new ParseStatus(), "title", new Outlink[0], metadata));
+
+    NutchDocument doc = new NutchDocument();
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+
+    Assert.assertEquals("content-disposition not detected", "filename.ext",
+        doc.getFieldValue("title"));
+
+    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
+    doc = new NutchDocument();
+    doc.add("title", "title");
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+    Assert.assertEquals("do not add second title by content-disposition",
+        "title", doc.getFieldValue("title"));
+  }
+
+  private void assertParts(String[] parts, int count, String... expected) {
+    Assert.assertEquals(count, parts.length);
+    for (int i = 0; i < expected.length; i++) {
+      Assert.assertEquals(expected[i], parts[i]);
+    }
+  }
+
+  private void assertContentType(Configuration conf, String source,
+      String expected) throws IndexingException {
+    Metadata metadata = new Metadata();
+    metadata.add(Response.CONTENT_TYPE, source);
+    MoreIndexingFilter filter = new MoreIndexingFilter();
+    filter.setConf(conf);
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(
+        "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
+            metadata)), new Text("http://www.example.com/"), new CrawlDatum(),
+        new Inlinks());
+    Assert.assertEquals("mime type not detected", expected,
+        doc.getFieldValue("type"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/README.txt b/nutch-plugins/index-replace/README.txt
new file mode 100644
index 0000000..4c866a7
--- /dev/null
+++ b/nutch-plugins/index-replace/README.txt
@@ -0,0 +1,95 @@
+IndexReplace plugin
+
+Allows indexing-time regexp replace manipulation of metadata fields.
+
+Configuration Example
+    <property>
+      <name>index.replace.regexp</name>
+      <value>
+        id=/file\:/http\:my.site.com/
+        url=/file\:/http\:my.site.com/2
+      </value>
+    </property>
+
+Property format: index.replace.regexp
+    The format of the property is a list of regexp replacements, one line per field being
+    modified.  Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
+
+    The fieldname precedes the equal sign.  The first character after the equal sign signifies
+    the delimiter for the regexp, the replacement value and the flags.
+
+Replacement Sequence
+    The replacements will happen in the order listed. If a field needs multiple replacement operations
+    they may be listed more than once.
+
+RegExp Format
+    The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
+    here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
+    Patterns are compiled when the plugin is initialized for efficiency.
+
+Replacement Format
+    The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement):
+    http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
+
+Flags
+    The flags is an integer sum of the flag values defined in
+    http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
+
+Creating New Fields
+    If you express the fieldname as fldname1:fldname2=[replacement], then the replacer will create a new field
+    from the source field.  The source field remains unmodified.  This is an alternative to solrindex-mapping
+    which is only able to copy fields verbatim.
+
+Multi-valued Fields
+    If a field has multiple values, the replacement will be applied to each value in turn.
+
+Non-string Datatypes
+    Replacement is possible only on String field datatypes.  If the field you name in the property is
+    not a String datatype, it will be silently ignored.
+
+Host and URL specific replacements.
+    If the replacements should apply only to specific pages, then add a sequence like
+
+    hostmatch=hostmatchpattern
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+    or
+    urlmatch=urlmatchpattern
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
+will apply to all parsed pages.  Replacements following a hostmatch or urlmatch will be applied
+to pages which match the host or url field (up to the next hostmatch or urlmatch line).  hostmatch
+and urlmatch patterns must be unique in this property.
+
+Plugin order
+    In most cases you will want this plugin to run last.
+
+Testing your match patterns
+    Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
+    can help get the basics of your pattern working.
+    To test in nutch: 
+        Prepare a test HTML file with the field contents you want to test. 
+        Place this in a directory accessible to nutch.
+        Use the file:/// syntax to list the test file(s) in a test/urls seed list.
+        See the nutch faq "index my local file system" for conf settings you will need.
+        (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
+        test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
+        patterns also match the file: URL pattern)
+ 
+    Run..
+        bin/nutch inject crawl/crawldb test
+        bin/nutch generate crawl/crawldb crawl/segments
+        bin/nutch fetch crawl/segments/[segment]
+        bin/nutch parse crawl/segments/[segment]
+        bin/nutch invertlinks crawl/linkdb -dir crawl/segments
+        ...index your document, for example with SOLR...
+        bin/nutch solrindex http://localhost:8983/solr crawl/crawldb/ -linkdb crawl/linkdb/ crawl/segement[segment] -filter -normalize
+
+    Inspect hadoop.log for info about pattern parsing and compilation..
+        grep replace logs/hadoop.log
+
+    To inspect your index with the solr admin panel...
+        http://localhost:8983/solr/#/

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/build.xml b/nutch-plugins/index-replace/build.xml
new file mode 100644
index 0000000..ea8c95d
--- /dev/null
+++ b/nutch-plugins/index-replace/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-replace" default="jar-core">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- Add compilation dependencies to classpath -->
+	<path id="plugin.deps">
+		<fileset dir="${nutch.root}/build">
+			<include name="**/index-basic/*.jar" />
+			<include name="**/index-metadata/*.jar" />
+		</fileset>
+		<pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+	</path>
+
+	<!-- Compile Unit test dependencies -->
+	<target name="deps-test-compile">
+		<ant target="compile-test" inheritall="false" dir="../index-basic"/>
+		<ant target="compile-test" inheritall="false" dir="../index-metadata"/>
+	</target>
+
+	<!-- Deploy Unit test dependencies -->
+	<target name="deps-test">
+		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+		<ant target="deploy" inheritall="false" dir="../protocol-file" />
+		<ant target="deploy" inheritall="false" dir="../parse-html" />
+		<ant target="deploy" inheritall="false" dir="../parse-metatags" />
+		<ant target="deploy" inheritall="false" dir="../index-basic" />
+		<ant target="deploy" inheritall="false" dir="../index-metadata" />
+	</target>
+
+	<!-- Copy test file for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.html" />
+		</fileset>
+	</copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/ivy.xml b/nutch-plugins/index-replace/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/index-replace/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/plugin.xml b/nutch-plugins/index-replace/plugin.xml
new file mode 100644
index 0000000..3cffe60
--- /dev/null
+++ b/nutch-plugins/index-replace/plugin.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="index-replace"
+   name="Replace Indexer"
+   version="1.0"
+   provider-name="PeterCiuffetti">
+
+   <runtime>
+      <library name="index-replace.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.indexer.replace"
+              name="Replace Indexer"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="ReplaceIndexer"
+                      class="org.apache.nutch.indexer.replace.ReplaceIndexer"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/pom.xml b/nutch-plugins/index-replace/pom.xml
new file mode 100644
index 0000000..d39851d
--- /dev/null
+++ b/nutch-plugins/index-replace/pom.xml
@@ -0,0 +1,50 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-replace</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-replace</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>index-basic</artifactId>
+            <version>${parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>index-metadata</artifactId>
+            <version>${parent.version}</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java
new file mode 100644
index 0000000..ddfe24d
--- /dev/null
+++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/FieldReplacer.java
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.replace;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * POJO to store a filename, its match pattern and its replacement string.
+ *
+ * A checkAndReplace method is provided where you can simultaneously check if
+ * the field matches this replacer and if the pattern matches your field value.
+ *
+ * @author Peter Ciuffetti
+ */
+public class FieldReplacer {
+
+  private static final Log LOG = LogFactory.getLog(FieldReplacer.class
+      .getName());
+
+  private final String fieldName;
+  private final String toFieldName;
+  private final Pattern pattern;
+  private final String replacement;
+  private boolean isValid;
+
+  /**
+   * Create a FieldReplacer for a field.
+   *
+   * Any pattern exceptions are caught within this constructor and the object is
+   * marked inValid. The error will be logged. This prevents this caller from
+   * attempting invalid replacements.
+   *
+   * @param fieldName
+   *          the name of the source field to operate on. Required.
+   * @param toFieldName
+   *          the name of the target field. Required.
+   * @param pattern
+   *          the pattern the field must match. Required.
+   * @param replacement
+   *          the replacement string
+   * @param flags
+   *          the Pattern flags value, or null if no flags are needed
+   */
+  public FieldReplacer(String fieldName, String toFieldName, String pattern,
+      String replacement, Integer flags) {
+
+    this.isValid = true;
+    // Must have a non-empty field name and pattern.
+    if (fieldName == null || fieldName.trim().length() == 0) {
+      LOG.error("Empty fieldName provided, FieldReplacer marked invalid.");
+      this.isValid = false;
+    }
+    if (pattern == null || pattern.trim().length() == 0) {
+      LOG.error("Empty pattern for field " + fieldName
+          + "provided, FieldReplacer marked invalid.");
+      this.isValid = false;
+    }
+
+    if (replacement == null) {
+      this.replacement = "";
+    } else {
+      this.replacement = replacement;
+    }
+
+    this.fieldName = fieldName.trim();
+    this.toFieldName = toFieldName.trim();
+
+    if (this.isValid) {
+      LOG.info("Compiling pattern " + pattern + " for field " + fieldName);
+      Pattern myPattern = null;
+      try {
+        if (flags != null) {
+          myPattern = Pattern.compile(pattern, flags);
+        } else {
+          myPattern = Pattern.compile(pattern);
+        }
+      } catch (PatternSyntaxException e) {
+        LOG.error("Pattern " + pattern + " for field " + fieldName
+            + " failed to compile: " + e.toString());
+        this.isValid = false;
+      }
+      this.pattern = myPattern;
+    } else {
+      this.pattern = null;
+    }
+  }
+
+  /**
+   * Field replacer with the input and output field the same.
+   *
+   * @param fieldName
+   * @param pattern
+   * @param replacement
+   * @param flags
+   */
+  public FieldReplacer(String fieldName, String pattern, String replacement,
+      Integer flags) {
+    this(fieldName, fieldName, pattern, replacement, flags);
+  }
+
+  public String getFieldName() {
+    return this.fieldName;
+  }
+
+  public String getToFieldName() {
+    return this.toFieldName;
+  }
+
+  public Pattern getPattern() {
+    return this.pattern;
+  }
+
+  public String getReplacement() {
+    return this.replacement;
+  }
+
+  /**
+   * Does this FieldReplacer have a valid fieldname and pattern?
+   *
+   * @return
+   */
+  public boolean isValid() {
+    return this.isValid;
+  }
+
+  /**
+   * Return the replacement value for a field value.
+   *
+   * This does not check for a matching field; the caller must decide if this
+   * FieldReplacer should operate on this value by checking getFieldName().
+   *
+   * The method returns the value with the replacement. If the value returned is
+   * not different then eiher the pattern didn't match or the replacement was a
+   * no-op.
+   *
+   * @param value
+   * @return
+   */
+  public String replace(String value) {
+    if (this.isValid) {
+      return this.pattern.matcher(value).replaceAll(replacement);
+    } else {
+      return value;
+    }
+  }
+
+  /**
+   * Return a replacement value for a field.
+   *
+   * This is designed to fail fast and trigger a replacement only when
+   * necessary. If this method returns null, either the field does not match or
+   * the value does not match the pattern (or possibly the pattern is invalid).
+   *
+   * So only if the method returns a non-null value will you need to replace the
+   * value for the field.
+   *
+   * @param fieldName
+   *          the name of the field you are checking
+   * @param value
+   *          the value of the field you are checking
+   * @return a replacement value. If null, either the field does not match or
+   *         the value does not match.
+   */
+  public String checkAndReplace(String fieldName, String value) {
+    if (this.fieldName.equals(fieldName)) {
+      if (value != null && value.length() > 0) {
+        if (this.isValid) {
+          Matcher m = this.pattern.matcher(value);
+          if (m.find()) {
+            return m.replaceAll(this.replacement);
+          }
+        }
+      }
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
new file mode 100644
index 0000000..7017603
--- /dev/null
+++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
@@ -0,0 +1,330 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.replace;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Do pattern replacements on selected field contents prior to indexing.
+ * 
+ * To use this plugin, add <code>index-replace</code> to your
+ * <code>plugin.includes</code>. Example:
+ * 
+ * <pre>
+ *   &lt;property>
+ *    &lt;name>plugin.includes&lt;/name>
+ *    &lt;value>protocol-(http)|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata|replace)|urlnormalizer-(pass|regex|basic)|indexer-solr&lt;/value>
+ *   &lt;/property>
+ * </pre>
+ *
+ * And then add the <code>index.replace.regexp</code> property to
+ * <code>conf/nutch-site.xml</code>. This contains a list of replacement
+ * instructions per field name, one per line. eg.
+ * 
+ * <pre>
+ *   fieldname=/regexp/replacement/[flags]
+ * </pre>
+ * 
+ * <pre>
+ *   &lt;property>
+ *    &lt;name>index.replace.regexp&lt;/name>
+ *    &lt;value>
+ *      hostmatch=.*\\.com
+ *      title=/search/replace/2
+ *    &lt;/value>
+ *   &lt;/property>
+ * </pre>
+ * 
+ * <code>hostmatch=</code> and <code>urlmatch=</code> lines indicate the match
+ * pattern for a host or url. The field replacements that follow this line will
+ * apply only to pages from the matching host or url. Replacements run in the
+ * order specified. Field names may appear multiple times if multiple
+ * replacements are needed.
+ * 
+ * The property format is defined in greater detail in
+ * <code>conf/nutch-default.xml</code>.
+ *
+ * @author Peter Ciuffetti
+ * @see <a
+ *      href="https://issues.apache.org/jira/browse/NUTCH-2058">NUTCH-2058</a>
+ */
+public class ReplaceIndexer implements IndexingFilter {
+
+  private static final Log LOG = LogFactory.getLog(ReplaceIndexer.class
+      .getName());
+
+  /** Special field name signifying the start of a host-specific match set */
+  private static final String HOSTMATCH = "hostmatch";
+  /** Special field name signifying the start of a url-specific match set */
+  private static final String URLMATCH = "urlmatch";
+
+  private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_HOST = new LinkedHashMap<Pattern, List<FieldReplacer>>();
+  private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_URL = new LinkedHashMap<Pattern, List<FieldReplacer>>();
+
+  private static Pattern LINE_SPLIT = Pattern.compile("(^.+$)+",
+      Pattern.MULTILINE);
+  private static Pattern NAME_VALUE_SPLIT = Pattern.compile("(.*?)=(.*)");
+
+  private Configuration conf;
+
  /**
   * Stores the configuration and (re)parses the
   * <code>index.replace.regexp</code> property into the replacement maps.
   *
   * NOTE(review): FIELDREPLACERS_BY_HOST / FIELDREPLACERS_BY_URL are static,
   * so each call clears and rebuilds state shared by every instance —
   * presumably safe for a single filter instance per JVM; confirm before
   * instantiating this filter more than once with different configurations.
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    FIELDREPLACERS_BY_HOST.clear();
    FIELDREPLACERS_BY_URL.clear();
    String value = conf.get("index.replace.regexp", null);
    if (value != null) {
      LOG.debug("Parsing index.replace.regexp property");
      this.parseConf(value);
    }
  }
+
  /**
   * Returns the {@link Configuration} previously passed to
   * {@link #setConf(Configuration)}.
   */
  public Configuration getConf() {
    return this.conf;
  }
+
+  /**
+   * Parse the property value into a set of maps that store a list of
+   * replacements by field for each host and url configured into the property.
+   * 
+   * @param propertyValue
+   */
+  private void parseConf(String propertyValue) {
+    if (propertyValue == null || propertyValue.trim().length() == 0) {
+      return;
+    }
+
+    // At the start, all replacements apply globally to every host.
+    Pattern hostPattern = Pattern.compile(".*");
+    Pattern urlPattern = null;
+
+    // Split the property into lines
+    Matcher lineMatcher = LINE_SPLIT.matcher(propertyValue);
+    while (lineMatcher.find()) {
+      String line = lineMatcher.group();
+      if (line != null && line.length() > 0) {
+
+        // Split the line into field and value
+        Matcher nameValueMatcher = NAME_VALUE_SPLIT.matcher(line.trim());
+        if (nameValueMatcher.find()) {
+          String fieldName = nameValueMatcher.group(1).trim();
+          String value = nameValueMatcher.group(2);
+          if (fieldName != null && value != null) {
+            // Check if the field name is one of our special cases.
+            if (HOSTMATCH.equals(fieldName)) {
+              urlPattern = null;
+              try {
+                hostPattern = Pattern.compile(value);
+              } catch (PatternSyntaxException pse) {
+                LOG.error("hostmatch pattern " + value + " does not compile: "
+                    + pse.getMessage());
+                // Deactivate this invalid match set by making it match no host.
+                hostPattern = Pattern.compile("willnotmatchanyhost");
+              }
+            } else if (URLMATCH.equals(fieldName)) {
+              try {
+                urlPattern = Pattern.compile(value);
+              } catch (PatternSyntaxException pse) {
+                LOG.error("urlmatch pattern " + value + " does not compile: "
+                    + pse.getMessage());
+                // Deactivate this invalid match set by making it match no url.
+                urlPattern = Pattern.compile("willnotmatchanyurl");
+              }
+            } else if (value.length() > 3) {
+              String toFieldName = fieldName;
+              // If the fieldname has a colon, this indicates a different target
+              // field.
+              if (fieldName.indexOf(':') > 0) {
+                toFieldName = fieldName.substring(fieldName.indexOf(':') + 1);
+                fieldName = fieldName.substring(0, fieldName.indexOf(':'));
+              }
+              String sep = value.substring(0, 1);
+
+              // Divide the value into pattern / replacement / flags.
+              value = value.substring(1);
+              if (!value.contains(sep)) {
+                LOG.error("Pattern '" + line
+                    + "', not parseable.  Missing separator " + sep);
+                continue;
+              }
+              String pattern = value.substring(0, value.indexOf(sep));
+              value = value.substring(pattern.length() + 1);
+              String replacement = value;
+              if (value.contains(sep)) {
+                replacement = value.substring(0, value.indexOf(sep));
+              }
+              int flags = 0;
+              if (value.length() > replacement.length() + 1) {
+                value = value.substring(replacement.length() + 1).trim();
+                try {
+                  flags = Integer.parseInt(value);
+                } catch (NumberFormatException e) {
+                  LOG.error("Pattern " + line + ", has invalid flags component");
+                  continue;
+                }
+              }
+              Integer iFlags = (flags > 0) ? new Integer(flags) : null;
+
+              // Make a FieldReplacer out of these params.
+              FieldReplacer fr = new FieldReplacer(fieldName, toFieldName,
+                  pattern, replacement, iFlags);
+
+              // Add this field replacer to the list for this host or URL.
+              if (urlPattern != null) {
+                List<FieldReplacer> lfp = FIELDREPLACERS_BY_URL.get(urlPattern);
+                if (lfp == null) {
+                  lfp = new ArrayList<FieldReplacer>();
+                }
+                lfp.add(fr);
+                FIELDREPLACERS_BY_URL.put(urlPattern, lfp);
+              } else {
+                List<FieldReplacer> lfp = FIELDREPLACERS_BY_HOST
+                    .get(hostPattern);
+                if (lfp == null) {
+                  lfp = new ArrayList<FieldReplacer>();
+                }
+                lfp.add(fr);
+                FIELDREPLACERS_BY_HOST.put(hostPattern, lfp);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    if (doc != null) {
+      if (FIELDREPLACERS_BY_HOST.size() > 0) {
+        this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST);
+      }
+
+      if (FIELDREPLACERS_BY_URL.size() > 0) {
+        this.doReplace(doc, "url", FIELDREPLACERS_BY_URL);
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Iterates through the replacement map provided, to update the fields in the
+   * Nutch Document.
+   * 
+   * @param doc
+   *          the document we are modifying
+   * @param keyName
+   *          either "host" or "url" -- the field that determines the
+   *          replacement set used
+   * @param replaceMap
+   *          the list of FieldReplacers that applies to this keyName.
+   */
+  private void doReplace(NutchDocument doc, String keyName,
+      Map<Pattern, List<FieldReplacer>> replaceMap) {
+
+    if (doc == null || replaceMap.size() == 0) {
+      return;
+    }
+
+    Collection<String> docFieldNames = doc.getFieldNames();
+    NutchField keyField = doc.getField(keyName);
+    if (keyField == null) {
+      // This document doesn't have the key field; no work to do.
+      return;
+    }
+
+    List<Object> keyFieldValues = keyField.getValues();
+    if (keyFieldValues.size() == 0) {
+      // This document doesn't have any values for the key field; no work to do.
+      return;
+    }
+
+    // For every value of the keyField (one expected)
+    for (Object oKeyFieldValue : keyFieldValues) {
+      if (oKeyFieldValue != null && oKeyFieldValue instanceof java.lang.String) {
+        String keyFieldValue = (String) oKeyFieldValue;
+
+        // For each pattern that we have a replacement list for...
+        for (Map.Entry<Pattern, List<FieldReplacer>> entries : replaceMap
+            .entrySet()) {
+          // If this key is a match for a replacement set...
+          if (entries.getKey().matcher(keyFieldValue).find()) {
+
+            // For each field we will replace for this key...
+            for (FieldReplacer fp : entries.getValue()) {
+              String fieldName = fp.getFieldName();
+
+              // Does this document contain the FieldReplacer's field?
+              if (docFieldNames.contains(fieldName)) {
+                NutchField docField = doc.getField(fieldName);
+                List<Object> fieldValues = docField.getValues();
+                ArrayList<String> newFieldValues = new ArrayList<String>();
+
+                // For each value of the field, match against our
+                // replacer...
+                for (Object oFieldValue : fieldValues) {
+                  if (oFieldValue != null
+                      && oFieldValue instanceof java.lang.String) {
+                    String fieldValue = (String) oFieldValue;
+                    String newValue = fp.replace(fieldValue);
+                    newFieldValues.add(newValue);
+                  }
+                }
+
+                // Remove the target field and add our replaced values.
+                String targetFieldName = fp.getToFieldName();
+                doc.removeField(targetFieldName);
+                for (String newFieldValue : newFieldValues) {
+                  doc.add(targetFieldName, newFieldValue);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java
new file mode 100644
index 0000000..28c24a4
--- /dev/null
+++ b/nutch-plugins/index-replace/src/main/java/org/apache/nutch/indexer/replace/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to allow pattern replacements on metadata.
+ */
+package org.apache.nutch.indexer.replace;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/test/java/org/apache/nutch/indexer/replace/TestIndexReplace.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/test/java/org/apache/nutch/indexer/replace/TestIndexReplace.java b/nutch-plugins/index-replace/src/test/java/org/apache/nutch/indexer/replace/TestIndexReplace.java
new file mode 100644
index 0000000..ca90ca3
--- /dev/null
+++ b/nutch-plugins/index-replace/src/test/java/org/apache/nutch/indexer/replace/TestIndexReplace.java
@@ -0,0 +1,456 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.replace;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.basic.BasicIndexingFilter;
+import org.apache.nutch.indexer.metadata.MetadataIndexer;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit tests for the <code>index-replace</code> plugin.
+ * 
+ * In these tests, the sample file has some meta tags added to the Nutch
+ * document by the <code>index-metadata</code> plugin. The
+ * <code>index-replace</code> plugin is then used to either change (or not
+ * change) the fields depending on the various values of
+ * <code>index.replace.regexp</code> property being provided to Nutch.
+ * 
+ * 
+ * @author Peter Ciuffetti
+ *
+ */
+public class TestIndexReplace {
+
+  private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp";
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testIndexReplace.html";
+
+  /**
+   * Run a test file through the Nutch parser and index filters.
+   * 
+   * @param fileName
+   * @param conf
+   * @return the Nutch document with the replace indexer applied
+   */
+  public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
+    NutchDocument doc = new NutchDocument();
+
+    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
+    basicIndexer.setConf(conf);
+    Assert.assertNotNull(basicIndexer);
+
+    MetadataIndexer metaIndexer = new MetadataIndexer();
+    metaIndexer.setConf(conf);
+    Assert.assertNotNull(basicIndexer);
+
+    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
+    replaceIndexer.setConf(conf);
+    Assert.assertNotNull(replaceIndexer);
+
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Text text = new Text(urlString);
+      CrawlDatum crawlDatum = new CrawlDatum();
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(text, crawlDatum)
+          .getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      crawlDatum.setFetchTime(100L);
+
+      Inlinks inlinks = new Inlinks();
+      doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+      doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+      doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+
+    return doc;
+  }
+
+  /**
+   * Test property parsing.
+   * 
+   * The filter does not expose details of the parse. So all we are checking is
+   * that the parse does not throw a runtime exception and that the value
+   * provided is the value returned.
+   */
+  @Test
+  public void testPropertyParse() {
+    Configuration conf = NutchConfiguration.create();
+    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this awesome plugin/2\n"
+        + "  metatag.keywords=/\\,/\\!/\n"
+        + "  hostmatch=.*.com\n"
+        + "  metatag.keywords=/\\,/\\?/\n"
+        + "  metatag.author:dc_author=/\\s+/ David /\n"
+        + "  urlmatch=.*.html\n"
+        + "  metatag.keywords=/\\,/\\./\n" + "  metatag.author=/\\s+/ D. /\n";
+
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+
+    ReplaceIndexer rp = new ReplaceIndexer();
+    try {
+      rp.setConf(conf);
+    } catch (RuntimeException ohno) {
+      Assert.fail("Unable to parse a valid index.replace.regexp property! "
+          + ohno.getMessage());
+    }
+
+    Configuration parsedConf = rp.getConf();
+
+    // Does the getter equal the setter? Too easy!
+    Assert.assertEquals(indexReplaceProperty,
+        parsedConf.get(INDEX_REPLACE_PROPERTY));
+  }
+
+  /**
+   * Test metatag value replacement using global replacement settings.
+   * 
+   * The index.replace.regexp property does not use hostmatch or urlmatch, so
+   * all patterns are global.
+   */
+  @Test
+  public void testGlobalReplacement() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+    String expectedAuthor = "Peter D. Ciuffetti";
+    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
+        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    Assert
+        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+  }
+
+  /**
+   * Test that invalid property settings are handled and ignored.
+   * 
+   * This test provides an invalid property setting that will fail property
+   * parsing and Pattern.compile. The expected outcome is that the patterns will
+   * not cause failure and the targeted fields will not be modified by the
+   * filter.
+   */
+  @Test
+  public void testInvalidPatterns() {
+    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+    String expectedAuthor = "Peter Ciuffetti";
+    // Contains: invalid pattern, invalid flags, incomplete property
+    String indexReplaceProperty = "  metatag.description=/this\\s+**plugin/this awesome plugin/\n"
+        + "  metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    // Assert that our metatags have not changed.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    Assert
+        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+
+  }
+
+  /**
+   * Test URL pattern matching
+   */
+  @Test
+  public void testUrlMatchesPattern() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+    String expectedAuthor = "Peter D. Ciuffetti";
+    String indexReplaceProperty = " urlmatch=.*.html\n"
+        + "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
+        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    // Assert that our metatags have changed.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    Assert
+        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+
+  }
+
+  /**
+   * Test URL pattern not matching.
+   * 
+   * Expected result is that the filter does not change the fields.
+   */
+  @Test
+  public void testUrlNotMatchesPattern() {
+    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+    String expectedAuthor = "Peter Ciuffetti";
+    String indexReplaceProperty = " urlmatch=.*.xml\n"
+        + "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
+        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    // Assert that our metatags have not changed.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    Assert
+        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+
+  }
+
+  /**
+   * Test a global pattern match for description and URL pattern match for
+   * keywords and author.
+   * 
+   * All three should be triggered. It also tests replacement groups.
+   */
+  @Test
+  public void testGlobalAndUrlMatchesPattern() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+    String expectedAuthor = "Peter D. Ciuffetti";
+    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
+        + "  urlmatch=.*.html\n"
+        + "  metatag.keywords=/\\,/\\!/\n"
+        + "  metatag.author=/\\s+/ D. /\n";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    // Assert that our metatags have changed.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    Assert
+        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+
+  }
+
+  /**
+   * Test a global pattern match for description and URL pattern match for
+   * keywords and author.
+   * 
+   * Only the global match should be triggered.
+   */
+  @Test
+  public void testGlobalAndUrlNotMatchesPattern() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+    String expectedAuthor = "Peter Ciuffetti";
+    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
+        + "  urlmatch=.*.xml\n"
+        + "  metatag.keywords=/\\,/\\!/\n"
+        + "  metatag.author=/\\s+/ D. /\n";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    // Assert that description has changed and the others have not changed.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    Assert
+        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+  }
+
+  /**
+   * Test order-specific replacement settings.
+   * 
+   * This makes multiple replacements on the same field and will produce the
+   * expected value only if the replacements are run in the order specified.
+   */
+  @Test
+  public void testReplacementsRunInSpecifedOrder() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String indexReplaceProperty = "  metatag.description=/this plugin/this amazing plugin/\n"
+        + "  metatag.description=/this amazing plugin/this valuable plugin/\n"
+        + "  metatag.description=/this valuable plugin/this cool plugin/\n"
+        + "  metatag.description=/this cool plugin/this wicked plugin/\n"
+        + "  metatag.description=/this wicked plugin/this awesome plugin/\n";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    // Check that the value produced by the last replacement has worked.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+  }
+
+  /**
+   * Test a replacement pattern that uses the flags feature.
+   * 
+   * A 2 is Pattern.CASE_INSENSITIVE. We look for upper case and expect to match
+   * any case.
+   */
+  @Test
+  public void testReplacementsWithFlags() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String indexReplaceProperty = "  metatag.description=/THIS PLUGIN/this awesome plugin/2";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    // Check that the value produced by the case-insensitive replacement has
+    // worked.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+  }
+
+  /**
+   * Test a replacement pattern that uses the target field feature.
+   * Check that the input is not modified and that the target field is added.
+   */
+  @Test
+  public void testReplacementsDifferentTarget() {
+    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+    String expectedTargetDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String indexReplaceProperty = "  metatag.description:new=/this plugin/this awesome plugin/";
+
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile, conf);
+
+    // Check that the input field has not been modified
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    // Check that the output field has been created
+    Assert.assertEquals(expectedTargetDescription,
+        doc.getFieldValue("new"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html b/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html
new file mode 100644
index 0000000..0b90fc2
--- /dev/null
+++ b/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html
@@ -0,0 +1,12 @@
+<html>
+  <head>
+    <title>Testing the power of the index-replace plugin</title>
+    <meta name="description" content="With this plugin, I control the description! Bwuhuhuhaha!">
+    <meta name="keywords" content="Breathtaking, Riveting, Two Thumbs Up!">
+    <meta name="author" content="Peter Ciuffetti">
+  </head>
+  <body>
+    <p>This html file is used to test the Nutch index-replace regexp replacer plugin.
+    A decidedly boring thing to do.</p>
+  </body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/build.xml b/nutch-plugins/index-static/build.xml
new file mode 100644
index 0000000..0ec5665
--- /dev/null
+++ b/nutch-plugins/index-static/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-static" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/ivy.xml b/nutch-plugins/index-static/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/index-static/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/plugin.xml b/nutch-plugins/index-static/plugin.xml
new file mode 100644
index 0000000..539e355
--- /dev/null
+++ b/nutch-plugins/index-static/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-static"
+   name="Index Static"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+    <runtime>
+      <library name="index-static.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+
+   <extension id="org.apache.nutch.indexer.staticfield"
+              name="Nutch static field index"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="StaticField"
+                      class="org.apache.nutch.indexer.staticfield.StaticFieldIndexer"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/pom.xml b/nutch-plugins/index-static/pom.xml
new file mode 100644
index 0000000..6eaf0ba
--- /dev/null
+++ b/nutch-plugins/index-static/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-static</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-static</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>


[07/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
new file mode 100644
index 0000000..8b1a031
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -0,0 +1,573 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.http.api.HttpException;
+
+/**
+ * An HTTP response.
+ */
+public class HttpResponse implements Response {
+
+  private Configuration conf;
+  private HttpBase http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+  // used for storing the http headers verbatim
+  private StringBuffer httpHeaders;
+
+  protected enum Scheme {
+    HTTP, HTTPS,
+  }
+
+  /**
+   * Default public constructor.
+   *
+   * @param http
+   * @param url
+   * @param datum
+   * @throws ProtocolException
+   * @throws IOException
+   */
+  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    Scheme scheme = null;
+
+    if ("http".equals(url.getProtocol())) {
+      scheme = Scheme.HTTP;
+    } else if ("https".equals(url.getProtocol())) {
+      scheme = Scheme.HTTPS;
+    } else {
+      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+    }
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      if (scheme == Scheme.HTTP) {
+        port = 80;
+      } else {
+        port = 443;
+      }
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      if (scheme == Scheme.HTTPS) {
+        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+            .getDefault();
+        SSLSocket sslsocket = (SSLSocket) factory
+            .createSocket(socket, sockHost, sockPort, true);
+        sslsocket.setUseClientMode(true);
+
+        // Get the protocols and ciphers supported by this JVM
+        Set<String> protocols = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedProtocols()));
+        Set<String> ciphers = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+        // Intersect with preferred protocols and ciphers
+        protocols.retainAll(http.getTlsPreferredProtocols());
+        ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+        sslsocket.setEnabledProtocols(
+            protocols.toArray(new String[protocols.size()]));
+        sslsocket.setEnabledCipherSuites(
+            ciphers.toArray(new String[ciphers.size()]));
+
+        sslsocket.startHandshake();
+        socket = sslsocket;
+      }
+
+      this.conf = http.getConf();
+      if (sockAddr != null
+          && conf.getBoolean("store.ip.address", false) == true) {
+        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+      }
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat
+            .toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      // store the request in the metadata?
+      if (conf.getBoolean("store.http.request", false) == true) {
+        headers.add("_request_", reqStr.toString());
+      }
+
+      byte[] reqBytes = reqStr.toString().getBytes();
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(
+              new BufferedInputStream(socket.getInputStream(),
+                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      // store the http headers verbatim
+      if (conf.getBoolean("store.http.headers", false) == true) {
+        httpHeaders = new StringBuffer();
+      }
+
+      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        if (httpHeaders != null)
+          httpHeaders.append(line).append("\n");
+        // parse headers
+        parseHeaders(in, line, httpHeaders);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with HtmlUnit only if content type is HTML or XHTML 
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          readContentFromHtmlUnit(url);
+        } else {
+          String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+          if (transferEncoding != null && "chunked"
+              .equalsIgnoreCase(transferEncoding.trim())) {
+            readChunkedContent(in, line);
+          } else {
+            readPlainContent(in);
+          }
+
+          String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+          if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+            content = http.processGzipEncoded(content, url);
+          } else if ("deflate".equals(contentEncoding)) {
+            content = http.processDeflateEncoded(content, url);
+          } else {
+            // store the headers verbatim only if the response was not compressed
+            // as the content length reported will not match otherwise
+            if (httpHeaders != null) {
+              headers.add("_response.headers_", httpHeaders.toString());
+            }
+            if (Http.LOG.isTraceEnabled()) {
+              Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+            }
+          }
+        }
+      }
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  private void readContentFromHtmlUnit(URL url) throws IOException {
+    String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
+    content = page.getBytes("UTF-8");
+  }
+  
+  private void readPlainContent(InputStream in)
+      throws HttpException, IOException {
+
+    int contentLength = Integer.MAX_VALUE; // get content length
+    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+    if (contentLengthString != null) {
+      contentLengthString = contentLengthString.trim();
+      try {
+        if (!contentLengthString.isEmpty())
+          contentLength = Integer.parseInt(contentLengthString);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad content length: " + contentLengthString);
+      }
+    }
+    if (http.getMaxContent() >= 0 && contentLength > http
+        .getMaxContent()) // limit
+      // download
+      // size
+      contentLength = http.getMaxContent();
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    int length = 0;
+
+    // do not try to read if the contentLength is 0
+    if (contentLength == 0) {
+      content = new byte[0];
+      return;
+    }
+
+    // read content
+    int i = in.read(bytes);
+    while (i != -1) {
+      out.write(bytes, 0, i);
+      length += i;
+      if (length >= contentLength) {
+        break;
+      }
+      if ((length + Http.BUFFER_SIZE) > contentLength) {
+        // reading next chunk may hit contentLength,
+        // must limit number of bytes read
+        i = in.read(bytes, 0, (contentLength - length));
+      } else {
+        i = in.read(bytes);
+      }
+    }
+    content = out.toByteArray();
+  }
+
+  /**
+   * @param in
+   * @param line
+   * @throws HttpException
+   * @throws IOException
+   */
+  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
+      throws HttpException, IOException {
+    boolean doneChunks = false;
+    int contentBytesRead = 0;
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+
+    while (!doneChunks) {
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace("Http: starting chunk");
+      }
+
+      readLine(in, line, false);
+
+      String chunkLenStr;
+      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
+      // }
+
+      int pos = line.indexOf(";");
+      if (pos < 0) {
+        chunkLenStr = line.toString();
+      } else {
+        chunkLenStr = line.substring(0, pos);
+        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
+        // line.substring(pos+1)); }
+      }
+      chunkLenStr = chunkLenStr.trim();
+      int chunkLen;
+      try {
+        chunkLen = Integer.parseInt(chunkLenStr, 16);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad chunk length: " + line.toString());
+      }
+
+      if (chunkLen == 0) {
+        doneChunks = true;
+        break;
+      }
+
+      if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
+          .getMaxContent())
+        chunkLen = http.getMaxContent() - contentBytesRead;
+
+      // read one chunk
+      int chunkBytesRead = 0;
+      while (chunkBytesRead < chunkLen) {
+
+        int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
+            (chunkLen - chunkBytesRead) :
+            Http.BUFFER_SIZE;
+        int len = in.read(bytes, 0, toRead);
+
+        if (len == -1)
+          throw new HttpException("chunk eof after " + contentBytesRead
+              + " bytes in successful chunks" + " and " + chunkBytesRead
+              + " in current chunk");
+
+        // DANGER!!! Will print GZIPed stuff right to your
+        // terminal!
+        // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
+        // len)); }
+
+        out.write(bytes, 0, len);
+        chunkBytesRead += len;
+      }
+
+      readLine(in, line, false);
+
+    }
+
+    if (!doneChunks) {
+      if (contentBytesRead != http.getMaxContent())
+        throw new HttpException("chunk eof: !doneChunk && didn't max out");
+      return;
+    }
+
+    content = out.toByteArray();
+    parseHeaders(in, line, null);
+
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException(
+          "bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  private void processHeaderLine(StringBuffer line)
+      throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  // Adds headers to our headers Metadata
+  private void parseHeaders(PushbackInputStream in, StringBuffer line,
+      StringBuffer httpHeaders) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      if (httpHeaders != null)
+        httpHeaders.append(line).append("\n");
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
+          (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
+          != -1)) {
+
+        in.unread(line.substring(pos).getBytes("UTF-8"));
+        line.setLength(pos);
+
+        try {
+          // TODO: (CM) We don't know the header names here
+          // since we're just handling them generically. It would
+          // be nice to provide some sort of mapping function here
+          // for the returned header names to the standard metadata
+          // names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  private static int readLine(PushbackInputStream in, StringBuffer line,
+      boolean allowContinuedLine) throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    in.unread(value);
+    return value;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html
new file mode 100644
index 0000000..4181951
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/package.html
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/build.xml b/nutch-plugins/protocol-http/build.xml
new file mode 100755
index 0000000..30720f1
--- /dev/null
+++ b/nutch-plugins/protocol-http/build.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-http" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-http"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+      <fileset dir="jsp"/>
+   </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/ivy.xml b/nutch-plugins/protocol-http/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/protocol-http/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/basic-http.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/jsp/basic-http.jsp b/nutch-plugins/protocol-http/jsp/basic-http.jsp
new file mode 100644
index 0000000..bf1f8bd
--- /dev/null
+++ b/nutch-plugins/protocol-http/jsp/basic-http.jsp
@@ -0,0 +1,44 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin  
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>HelloWorld</title>
+    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
+    <meta name="Language" content="en" />
+	<meta http-equiv="pragma" content="no-cache">
+	<meta http-equiv="cache-control" content="no-cache">
+	<meta http-equiv="expires" content="0">    
+	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+	<meta http-equiv="description" content="This is my page">
+	<!--
+	<link rel="stylesheet" type="text/css" href="styles.css">
+	-->
+  </head>
+  
+  <body>
+    Hello World!!! <br>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/brokenpage.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/jsp/brokenpage.jsp b/nutch-plugins/protocol-http/jsp/brokenpage.jsp
new file mode 100644
index 0000000..f3f7c4a
--- /dev/null
+++ b/nutch-plugins/protocol-http/jsp/brokenpage.jsp
@@ -0,0 +1,47 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin
+--%>
+
+@ page language="java" import="java.util.*" pageEncoding="UTF-8"
+
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>HelloWorld</title>
+    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
+    <meta name="Language" content="en" />
+	<meta http-equiv="pragma" content="no-cache">
+	<meta http-equiv="cache-control" content="no-cache">
+	<meta http-equiv="expires" content="0">    
+	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+	<meta http-equiv="description" content="This is my page">
+	<!--
+	<link rel="stylesheet" type="text/css" href="styles.css">
+	-->
+  </head>
+  
+  <body>
+    Hello World!!! <br>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/redirect301.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/jsp/redirect301.jsp b/nutch-plugins/protocol-http/jsp/redirect301.jsp
new file mode 100644
index 0000000..1100b89
--- /dev/null
+++ b/nutch-plugins/protocol-http/jsp/redirect301.jsp
@@ -0,0 +1,49 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>My JSP page</title>
+    
+	<meta http-equiv="pragma" content="no-cache">
+	<meta http-equiv="cache-control" content="no-cache">
+	<meta http-equiv="expires" content="0">    
+	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+	<meta http-equiv="description" content="This is my page">
+	<!--
+	<link rel="stylesheet" type="text/css" href="styles.css">
+	-->
+
+  </head>
+  
+  <body>
+       <%
+	response.setStatus(301);
+	response.setHeader( "Location", "http://nutch.apache.org");
+	response.setHeader( "Connection", "close" );
+		%> 
+    You are redirected by JSP<br>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/jsp/redirect302.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/jsp/redirect302.jsp b/nutch-plugins/protocol-http/jsp/redirect302.jsp
new file mode 100644
index 0000000..8a250d9
--- /dev/null
+++ b/nutch-plugins/protocol-http/jsp/redirect302.jsp
@@ -0,0 +1,49 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  Example JSP Page to Test Protocol-Http Plugin 
+--%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
+String path = request.getContextPath();
+String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
+%>
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+  <head>
+    <base href="<%=basePath%>">
+    
+    <title>My JSP page</title>
+    
+	<meta http-equiv="pragma" content="no-cache">
+	<meta http-equiv="cache-control" content="no-cache">
+	<meta http-equiv="expires" content="0">    
+	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
+	<meta http-equiv="description" content="This is my page">
+	<!--
+	<link rel="stylesheet" type="text/css" href="styles.css">
+	-->
+
+  </head>
+  
+  <body>
+       <%
+	response.setStatus(302);
+	response.setHeader( "Location", "http://nutch.apache.org");
+	response.setHeader( "Connection", "close" );
+		%> 
+    You are successfully redirected by JSP<br>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/plugin.xml b/nutch-plugins/protocol-http/plugin.xml
new file mode 100755
index 0000000..8770b10
--- /dev/null
+++ b/nutch-plugins/protocol-http/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-http"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-http.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.http"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.http.Http"
+                      class="org.apache.nutch.protocol.http.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+      
+      <implementation id="org.apache.nutch.protocol.http.Http"
+                       class="org.apache.nutch.protocol.http.Http">
+           <parameter name="protocolName" value="https"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/pom.xml b/nutch-plugins/protocol-http/pom.xml
new file mode 100644
index 0000000..e7ade28
--- /dev/null
+++ b/nutch-plugins/protocol-http/pom.xml
@@ -0,0 +1,57 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-http</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-http</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId> org.mortbay.jetty</groupId>
+            <artifactId>jetty</artifactId>
+            <version>6.1.26</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId> org.mortbay.jetty</groupId>
+            <artifactId>jsp-2.1</artifactId>
+            <version>6.1.14</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java
new file mode 100755
index 0000000..56f9f4f
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/Http.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /**
+   * Public default constructor.
+   */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Set the {@link org.apache.hadoop.conf.Configuration} object.
+   * 
+   * @param conf
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    // Level logLevel = Level.WARNING;
+    // if (conf.getBoolean("http.verbose", false)) {
+    // logLevel = Level.FINE;
+    // }
+    // LOG.setLevel(logLevel);
+  }
+
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
new file mode 100644
index 0000000..f6d7e4d
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -0,0 +1,558 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.net.ssl.SSLSocket;
+import javax.net.ssl.SSLSocketFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.http.api.HttpException;
+
+/**
+ * An HTTP response.
+ */
+public class HttpResponse implements Response {
+
+  private Configuration conf;
+  private HttpBase http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+  // used for storing the http headers verbatim
+  private StringBuffer httpHeaders;
+
+  protected enum Scheme {
+    HTTP, HTTPS,
+  }
+
+  /**
+   * Default public constructor.
+   *
+   * @param http
+   * @param url
+   * @param datum
+   * @throws ProtocolException
+   * @throws IOException
+   */
+  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    Scheme scheme = null;
+
+    if ("http".equals(url.getProtocol())) {
+      scheme = Scheme.HTTP;
+    } else if ("https".equals(url.getProtocol())) {
+      scheme = Scheme.HTTPS;
+    } else {
+      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
+    }
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      if (scheme == Scheme.HTTP) {
+        port = 80;
+      } else {
+        port = 443;
+      }
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      if (scheme == Scheme.HTTPS) {
+        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
+            .getDefault();
+        SSLSocket sslsocket = (SSLSocket) factory
+            .createSocket(socket, sockHost, sockPort, true);
+        sslsocket.setUseClientMode(true);
+
+        // Get the protocols and ciphers supported by this JVM
+        Set<String> protocols = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedProtocols()));
+        Set<String> ciphers = new HashSet<String>(
+            Arrays.asList(sslsocket.getSupportedCipherSuites()));
+
+        // Intersect with preferred protocols and ciphers
+        protocols.retainAll(http.getTlsPreferredProtocols());
+        ciphers.retainAll(http.getTlsPreferredCipherSuites());
+
+        sslsocket.setEnabledProtocols(
+            protocols.toArray(new String[protocols.size()]));
+        sslsocket.setEnabledCipherSuites(
+            ciphers.toArray(new String[ciphers.size()]));
+
+        sslsocket.startHandshake();
+        socket = sslsocket;
+      }
+
+      this.conf = http.getConf();
+      if (sockAddr != null
+          && conf.getBoolean("store.ip.address", false) == true) {
+        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
+      }
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat
+            .toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      // store the request in the metadata?
+      if (conf.getBoolean("store.http.request", false) == true) {
+        headers.add("_request_", reqStr.toString());
+      }
+
+      byte[] reqBytes = reqStr.toString().getBytes();
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(
+              new BufferedInputStream(socket.getInputStream(),
+                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      // store the http headers verbatim
+      if (conf.getBoolean("store.http.headers", false) == true) {
+        httpHeaders = new StringBuffer();
+      }
+
+      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
+
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        if (httpHeaders != null)
+          httpHeaders.append(line).append("\n");
+        // parse headers
+        parseHeaders(in, line, httpHeaders);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
+      if (transferEncoding != null && "chunked"
+          .equalsIgnoreCase(transferEncoding.trim())) {
+        readChunkedContent(in, line);
+      } else {
+        readPlainContent(in);
+      }
+
+      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+      if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+        content = http.processGzipEncoded(content, url);
+      } else if ("deflate".equals(contentEncoding)) {
+        content = http.processDeflateEncoded(content, url);
+      } else {
+        // store the headers verbatim only if the response was not compressed
+        // as the content length reported will not match otherwise
+        if (httpHeaders != null) {
+          headers.add("_response.headers_", httpHeaders.toString());
+        }
+        if (Http.LOG.isTraceEnabled()) {
+          Http.LOG.trace("fetched " + content.length + " bytes from " + url);
+        }
+      }
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  private void readPlainContent(InputStream in)
+      throws HttpException, IOException {
+
+    int contentLength = Integer.MAX_VALUE; // get content length
+    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+    if (contentLengthString != null) {
+      contentLengthString = contentLengthString.trim();
+      try {
+        if (!contentLengthString.isEmpty())
+          contentLength = Integer.parseInt(contentLengthString);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad content length: " + contentLengthString);
+      }
+    }
+    if (http.getMaxContent() >= 0 && contentLength > http
+        .getMaxContent()) // limit
+      // download
+      // size
+      contentLength = http.getMaxContent();
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    int length = 0;
+
+    // do not try to read if the contentLength is 0
+    if (contentLength == 0) {
+      content = new byte[0];
+      return;
+    }
+
+    // read content
+    int i = in.read(bytes);
+    while (i != -1) {
+      out.write(bytes, 0, i);
+      length += i;
+      if (length >= contentLength) {
+        break;
+      }
+      if ((length + Http.BUFFER_SIZE) > contentLength) {
+        // reading next chunk may hit contentLength,
+        // must limit number of bytes read
+        i = in.read(bytes, 0, (contentLength - length));
+      } else {
+        i = in.read(bytes);
+      }
+    }
+    content = out.toByteArray();
+  }
+
+  /**
+   * @param in
+   * @param line
+   * @throws HttpException
+   * @throws IOException
+   */
+  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
+      throws HttpException, IOException {
+    boolean doneChunks = false;
+    int contentBytesRead = 0;
+    byte[] bytes = new byte[Http.BUFFER_SIZE];
+    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
+
+    while (!doneChunks) {
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace("Http: starting chunk");
+      }
+
+      readLine(in, line, false);
+
+      String chunkLenStr;
+      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
+      // }
+
+      int pos = line.indexOf(";");
+      if (pos < 0) {
+        chunkLenStr = line.toString();
+      } else {
+        chunkLenStr = line.substring(0, pos);
+        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
+        // line.substring(pos+1)); }
+      }
+      chunkLenStr = chunkLenStr.trim();
+      int chunkLen;
+      try {
+        chunkLen = Integer.parseInt(chunkLenStr, 16);
+      } catch (NumberFormatException e) {
+        throw new HttpException("bad chunk length: " + line.toString());
+      }
+
+      if (chunkLen == 0) {
+        doneChunks = true;
+        break;
+      }
+
+      if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
+          .getMaxContent())
+        chunkLen = http.getMaxContent() - contentBytesRead;
+
+      // read one chunk
+      int chunkBytesRead = 0;
+      while (chunkBytesRead < chunkLen) {
+
+        int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
+            (chunkLen - chunkBytesRead) :
+            Http.BUFFER_SIZE;
+        int len = in.read(bytes, 0, toRead);
+
+        if (len == -1)
+          throw new HttpException("chunk eof after " + contentBytesRead
+              + " bytes in successful chunks" + " and " + chunkBytesRead
+              + " in current chunk");
+
+        // DANGER!!! Will print GZIPed stuff right to your
+        // terminal!
+        // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
+        // len)); }
+
+        out.write(bytes, 0, len);
+        chunkBytesRead += len;
+      }
+
+      readLine(in, line, false);
+
+    }
+
+    if (!doneChunks) {
+      if (contentBytesRead != http.getMaxContent())
+        throw new HttpException("chunk eof: !doneChunk && didn't max out");
+      return;
+    }
+
+    content = out.toByteArray();
+    parseHeaders(in, line, null);
+
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException(
+          "bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  private void processHeaderLine(StringBuffer line)
+      throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  // Adds headers to our headers Metadata
+  private void parseHeaders(PushbackInputStream in, StringBuffer line,
+      StringBuffer httpHeaders) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      if (httpHeaders != null)
+        httpHeaders.append(line).append("\n");
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
+          (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
+          != -1)) {
+
+        in.unread(line.substring(pos).getBytes("UTF-8"));
+        line.setLength(pos);
+
+        try {
+          // TODO: (CM) We don't know the header names here
+          // since we're just handling them generically. It would
+          // be nice to provide some sort of mapping function here
+          // for the returned header names to the standard metadata
+          // names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  private static int readLine(PushbackInputStream in, StringBuffer line,
+      boolean allowContinuedLine) throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    in.unread(value);
+    return value;
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html
new file mode 100644
index 0000000..34d1d1c
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/main/java/org/apache/nutch/protocol/http/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml b/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml
new file mode 100644
index 0000000..a9afd78
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/test/conf/nutch-site-test.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<configuration>
+
+<property>
+  <name>http.robots.agents</name>
+  <value>Nutch-Test,*</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.name</name>
+  <value>Nutch-Test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.description</name>
+  <value>Nutch protocol-http test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.auth.file</name>
+  <value>httpclient-auth-test.xml</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.timeout</name>
+  <value>60000</value>
+  <description></description>
+</property>
+
+</configuration>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java b/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java
new file mode 100644
index 0000000..7dd9e9b
--- /dev/null
+++ b/nutch-plugins/protocol-http/src/test/java/org/apache/nutch/protocol/http/TestProtocolHttp.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http;
+
+import static org.junit.Assert.assertEquals;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.junit.After;
+import org.junit.Test;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.nio.SelectChannelConnector;
+import org.mortbay.jetty.servlet.Context;
+import org.mortbay.jetty.servlet.ServletHolder;
+
+/**
+ * Test cases for protocol-http
+ */
+public class TestProtocolHttp {
+  private static final String RES_DIR = System.getProperty("test.data", ".");
+
+  private Http http;
+  private Server server;
+  private Context root;
+  private Configuration conf;
+  private int port;
+
+  public void setUp(boolean redirection) throws Exception {
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+
+    http = new Http();
+    http.setConf(conf);
+
+    server = new Server();
+
+    if (redirection) {
+      root = new Context(server, "/redirection", Context.SESSIONS);
+      root.setAttribute("newContextURL", "/redirect");
+    } else {
+      root = new Context(server, "/", Context.SESSIONS);
+    }
+
+    ServletHolder sh = new ServletHolder(
+        org.apache.jasper.servlet.JspServlet.class);
+    root.addServlet(sh, "*.jsp");
+    root.setResourceBase(RES_DIR);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    server.stop();
+  }
+
+  @Test
+  public void testStatusCode() throws Exception {
+    startServer(47504, false);
+    fetchPage("/basic-http.jsp", 200);
+    fetchPage("/redirect301.jsp", 301);
+    fetchPage("/redirect302.jsp", 302);
+    fetchPage("/nonexists.html", 404);
+    fetchPage("/brokenpage.jsp", 500);
+  }
+
+  @Test
+  public void testRedirectionJetty() throws Exception {
+    // Redirection via Jetty
+    startServer(47503, true);
+    fetchPage("/redirection", 302);
+  }
+
+  /**
+   * Starts the Jetty server at a specified port and redirection parameter.
+   * 
+   * @param portno
+   *          Port number.
+   * @param redirection
+   *          whether the server is set up with a redirection context
+   */
+  private void startServer(int portno, boolean redirection) throws Exception {
+    port = portno;
+    setUp(redirection);
+    SelectChannelConnector connector = new SelectChannelConnector();
+    connector.setHost("127.0.0.1");
+    connector.setPort(port);
+
+    server.addConnector(connector);
+    server.start();
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local Jetty server and
+   * checks whether the HTTP response status code matches with the expected
+   * code. Also use jsp pages for redirection.
+   * 
+   * @param page
+   *          Page to be fetched.
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page.
+   */
+  private void fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    CrawlDatum crawlDatum = new CrawlDatum();
+    Response response = http.getResponse(url, crawlDatum, true);
+    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
+        crawlDatum);
+    Content content = out.getContent();
+    assertEquals("HTTP Status Code for " + url, expectedCode,
+        response.getCode());
+
+    if (page.compareTo("/nonexists.html") != 0
+        && page.compareTo("/brokenpage.jsp") != 0
+        && page.compareTo("/redirection") != 0) {
+      assertEquals("ContentType " + url, "text/html",
+          content.getContentType());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/build.xml b/nutch-plugins/protocol-httpclient/build.xml
new file mode 100644
index 0000000..b66eb97
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/build.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-httpclient" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+  </target>
+
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+  <target name="deps-test">
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+      <fileset dir="jsp"/>
+   </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/ivy.xml b/nutch-plugins/protocol-httpclient/ivy.xml
new file mode 100644
index 0000000..00b6f07
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.jsoup" name="jsoup" rev="1.8.1" />
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/basic.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/basic.jsp b/nutch-plugins/protocol-httpclient/jsp/basic.jsp
new file mode 100644
index 0000000..c5bfb89
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/basic.jsp
@@ -0,0 +1,74 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP demonstrates basic authentication. When this JSP page is
+  requested with no query parameters, then the user must enter the
+  username as 'userx' and password as 'passx' when prompted for
+  authentication. Apart from this there are a few other test cases,
+  which can be used by passing a test case number as query parameter in
+  the following manner: basic.jsp?case=1, basic.jsp?case=2, etc.
+  The credentials for each test case can be easily figured out from the
+  code below.
+
+  Author: Susam Pal
+--%><%@ page
+    import = "sun.misc.BASE64Decoder"
+%><%
+  String authHeader = request.getHeader("Authorization");
+  String realm = null;
+  String username = null;
+  String password = null;
+  int testCase = 0;
+  try {
+    testCase = Integer.parseInt(request.getParameter("case"));
+  } catch (Exception ex) {
+    // do nothing
+  }
+  switch (testCase) {
+    case 1:
+      realm = "realm1"; username = "user1"; password = "pass1";
+      break;
+
+    case 2:
+      realm = "realm2"; username = "user2"; password = "pass2";
+      break;
+
+    default:
+      realm = "realmx"; username = "userx"; password = "passx";
+      break;
+  }
+
+  boolean authenticated = false;
+  if (authHeader != null && authHeader.toUpperCase().startsWith("BASIC")) {
+    String creds[] = new String(new BASE64Decoder().decodeBuffer(
+        authHeader.substring(6))).split(":", 2);
+    if (creds[0].equals(username) && creds[1].equals(password))
+          authenticated = true;
+  }
+  if (!authenticated) {
+    response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\"");
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>Basic Authentication Test</title></head>
+<body>
+<p>Hi <%= username %>, you have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/cookies.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/cookies.jsp b/nutch-plugins/protocol-httpclient/jsp/cookies.jsp
new file mode 100644
index 0000000..ae2ace2
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/cookies.jsp
@@ -0,0 +1,63 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP tests whether the client can remember cookies. When the JSP
+  is fetched for the first time without any query parameters, it sets
+  a few cookies in the client. On a second request, with the query
+  parameter, 'cookie=yes', it checks whether the client has sent all
+  the cookies. If the cookies are found, HTTP 200 response is returned.
+  If the cookies are not found, HTTP 403 response is returned.
+
+  Author: Susam Pal
+--%><%
+  String cookieParam = request.getParameter("cookie");
+  if (!"yes".equals(cookieParam)) { // Send cookies
+    response.addCookie(new Cookie("var1", "val1"));
+    response.addCookie(new Cookie("var2", "val2"));
+%>
+<html>
+<head><title>Cookies Set</title></head>
+<body><p>Cookies have been set.</p></body>
+</html>
+<%
+  } else { // Check cookies
+    int cookiesCount = 0;
+
+    Cookie[] cookies = request.getCookies();
+    if (cookies != null) {
+      for (int i = 0; i < cookies.length; i++) {
+        if (cookies[i].getName().equals("var1")
+            && cookies[i].getValue().equals("val1"))
+          cookiesCount++;
+
+        if (cookies[i].getName().equals("var2")
+            && cookies[i].getValue().equals("val2"))
+          cookiesCount++;
+      }
+    }
+
+    if (cookiesCount != 2) {
+      response.sendError(response.SC_FORBIDDEN);
+    } else {
+%>
+<html>
+<head><title>Cookies Found</title></head>
+<body><p>Cookies found!</p></body>
+</html>
+<%
+    }
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/digest.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/digest.jsp b/nutch-plugins/protocol-httpclient/jsp/digest.jsp
new file mode 100644
index 0000000..c657484
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/digest.jsp
@@ -0,0 +1,68 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP tests digest authentication. It generates an HTTP response
+  with authorization header for digest authentication and checks the
+  user-name supplied by the client. It does not check the other
+  parameters and hashes as controlled JUnit tests would be performed
+  against this and only the proper submission of credentials need to
+  be tested.
+
+  Author: Susam Pal
+--%><%@ page
+    import = "java.util.StringTokenizer"
+    import = "java.util.HashMap"
+%><%
+  String username = "digest_user";
+  String authHeader = request.getHeader("Authorization");
+  
+  boolean authenticated = false;
+  if (authHeader != null && authHeader.toUpperCase().startsWith("DIGEST")) {
+    HashMap map = new HashMap();
+    StringTokenizer tokenizer = new StringTokenizer(
+        authHeader.substring(7).trim(), ",");
+    while (tokenizer.hasMoreTokens()) {
+      String[] param = tokenizer.nextToken().trim().split("=", 2);
+      if (param[1].charAt(0) == '"') {
+        param[1] = param[1].substring(1, param[1].length() - 1);
+      }
+      map.put(param[0], param[1]);
+    }
+
+    if (username.equals((String)map.get("username")))
+      authenticated = true;
+  }
+
+  if (!authenticated) {
+    String realm = "realm=\"realm1\"";
+    String qop   = "qop=\"auth,auth-int\"";
+    String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\"";
+    String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\"";
+
+    response.setHeader("WWW-Authenticate", "Digest " + realm + ", "
+        + qop + ", " + nonce + ", " + opaque);
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>Digest Authentication Test</title></head>
+<body>
+<p>Hi <%= username %>, you have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/noauth.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/noauth.jsp b/nutch-plugins/protocol-httpclient/jsp/noauth.jsp
new file mode 100644
index 0000000..c726b0f
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/noauth.jsp
@@ -0,0 +1,36 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP tests whether the client is sending any pre-emptive
+  authentication headers. The client is expected not to send pre-emptive
+  authentication headers. If such authentication headers are found, this
+  JSP will return an HTTP 401 response; HTTP 200 response otherwise.
+
+  Author: Susam Pal
+--%><%
+  if (request.getHeader("Authorization") != null) {
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head><title>No authorization headers found</title></head>
+<body>
+<p>No authorization headers found.</p>
+</body>
+</html>
+<%
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp b/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp
new file mode 100644
index 0000000..6ad921e
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/jsp/ntlm.jsp
@@ -0,0 +1,89 @@
+<%--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+--%><%--
+  This JSP tests NTLM authentication. It generates an HTTP response
+  with authorization header for NTLM authentication and checks the
+  user-name supplied by the client. It does not check the other
+  parameters and hashes as controlled JUnit tests would be performed
+  against this and only the proper submission of credentials need to
+  be tested.
+
+  Author: Susam Pal
+--%><%@ page
+    import = "sun.misc.BASE64Decoder"
+    import = "sun.misc.BASE64Encoder"
+%><%
+  String authHeader = request.getHeader("Authorization");
+  String username = null;
+  String domain = null;
+  String host = null;
+
+  boolean authenticated = false;
+  if (authHeader != null && authHeader.startsWith("NTLM")) {
+    byte[] msg = new BASE64Decoder().decodeBuffer(
+        authHeader.substring(5));
+    if (msg[8] == 1) {
+      byte[] type2msg = {
+          'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature
+          2, 0, 0, 0,                           // Type 2 Indicator
+          10, 0, 10, 0, 32, 0, 0, 0,            // length, offset
+          0x00, 0x02, (byte) 0x81, 0,           // Flags
+          1, 2, 3, 4, 5, 6, 7, 8,               // Challenge
+          'N', 'U', 'T', 'C', 'H' // NUTCH (Domain)
+      };
+      response.setHeader("WWW-Authenticate", "NTLM "
+          + new BASE64Encoder().encodeBuffer(type2msg));
+      response.sendError(response.SC_UNAUTHORIZED);
+      return;
+    } else if (msg[8] == 3) {
+      int length;
+      int offset;
+
+      // Get domain name
+      length = msg[30] + msg[31] * 256;
+      offset = msg[32] + msg[33] * 256;
+      domain = new String(msg, offset, length);
+
+      // Get user name
+      length = msg[38] + msg[39] * 256;
+      offset = msg[40] + msg[41] * 256;
+      username = new String(msg, offset, length);
+
+      // Get host (workstation) name
+      length = msg[46] + msg[47] * 256;
+      offset = msg[48] + msg[49] * 256;
+      host = new String(msg, offset, length);
+
+      if ("ntlm_user".equalsIgnoreCase(username)
+          && "NUTCH".equalsIgnoreCase(domain))
+        authenticated = true;
+    }
+  }
+
+  if (!authenticated) {
+    response.setHeader("WWW-Authenticate", "NTLM");
+    response.sendError(response.SC_UNAUTHORIZED);
+  } else {
+%>
+<html>
+<head>NTLM Authentication Test</head>
+<body>
+<p>Hi <%= username %>, You have been successfully authenticated.</p>
+</body>
+</html>
+<%
+  }
+%>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/plugin.xml b/nutch-plugins/protocol-httpclient/plugin.xml
new file mode 100644
index 0000000..1747713
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/plugin.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+   
+   http://www.apache.org/licenses/LICENSE-2.0
+   
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<plugin
+   id="protocol-httpclient"
+   name="Http / Https Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+   
+   <runtime>
+      <library name="protocol-httpclient.jar">
+         <export name="*"/>
+      </library>
+      <library name="jsoup-1.8.1.jar"/>
+   </runtime>
+   
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+   </requires>
+   
+   <extension id="org.apache.nutch.protocol.httpclient"
+      name="HttpProtocol"
+      point="org.apache.nutch.protocol.Protocol">
+      
+      <implementation id="org.apache.nutch.protocol.httpclient.Http"
+         class="org.apache.nutch.protocol.httpclient.Http">
+         <parameter name="protocolName" value="http"/>
+      </implementation>
+      
+   </extension>
+   
+   <extension id="org.apache.nutch.protocol.https"
+      name="HttpsProtocol"
+      point="org.apache.nutch.protocol.Protocol">
+      
+      <implementation id="org.apache.nutch.protocol.httpclient.Http"
+         class="org.apache.nutch.protocol.httpclient.Http">
+         <parameter name="protocolName" value="https"/>
+      </implementation>
+      
+   </extension>
+   
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/pom.xml b/nutch-plugins/protocol-httpclient/pom.xml
new file mode 100644
index 0000000..2f2fc7c
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/pom.xml
@@ -0,0 +1,62 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-httpclient</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-httpclient</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.8.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId> org.mortbay.jetty</groupId>
+            <artifactId>jetty</artifactId>
+            <version>6.1.26</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId> org.mortbay.jetty</groupId>
+            <artifactId>jsp-2.1</artifactId>
+            <version>6.1.14</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+</project>


[16/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
new file mode 100644
index 0000000..9f616fe
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -0,0 +1,587 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.net.URL;
+import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ThreadLocalRandom;
+// Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.GZIPUtils;
+import org.apache.nutch.util.DeflateUtils;
+import org.apache.hadoop.util.StringUtils;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+
+// crawler-commons imports
+import crawlercommons.robots.BaseRobotRules;
+
+public abstract class HttpBase implements Protocol {
+
+  public static final Text RESPONSE_TIME = new Text("_rs_");
+
+  public static final int BUFFER_SIZE = 8 * 1024;
+
+  private static final byte[] EMPTY_CONTENT = new byte[0];
+
+  private HttpRobotRulesParser robots = null;
+
+  private ArrayList<String> userAgentNames = null;
+
+  /** The proxy hostname. */
+  protected String proxyHost = null;
+
+  /** The proxy port. */
+  protected int proxyPort = 8080;
+  
+  /** The proxy exception list. */
+  protected HashMap proxyException = new HashMap(); 
+
+  /** Indicates if a proxy is used */
+  protected boolean useProxy = false;
+
+  /** The network timeout in millisecond */
+  protected int timeout = 10000;
+
+  /** The length limit for downloaded content, in bytes. */
+  protected int maxContent = 64 * 1024;
+
+  /** The Nutch 'User-Agent' request header */
+  protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
+      "http://nutch.apache.org/bot.html", "agent@nutch.apache.org");
+
+  /** The "Accept-Language" request header value. */
+  protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
+
+  /** The "Accept" request header value. */
+  protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
+
+  /** The default logger */
+  private final static Logger LOGGER = LoggerFactory.getLogger(HttpBase.class);
+
+  /** The specified logger */
+  private Logger logger = LOGGER;
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  /** Do we use HTTP/1.1? */
+  protected boolean useHttp11 = false;
+
+  /**
+   * Record response time in CrawlDatum's meta data, see property
+   * http.store.responsetime.
+   */
+  protected boolean responseTime = true;
+
+  /** Skip page if Crawl-Delay longer than this value. */
+  protected long maxCrawlDelay = -1L;
+
+  /** Which TLS/SSL protocols to support */
+  protected Set<String> tlsPreferredProtocols;
+
+  /** Which TLS/SSL cipher suites to support */
+  protected Set<String> tlsPreferredCipherSuites;
+  
+  /** Configuration directive for If-Modified-Since HTTP header */
+  public boolean enableIfModifiedsinceHeader = true;
+
+  /** Creates a new instance of HttpBase */
+  public HttpBase() {
+    this(null);
+  }
+
+  /** Creates a new instance of HttpBase */
+  public HttpBase(Logger logger) {
+    if (logger != null) {
+      this.logger = logger;
+    }
+    robots = new HttpRobotRulesParser();
+  }
+
+  // Inherited Javadoc
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.proxyHost = conf.get("http.proxy.host");
+    this.proxyPort = conf.getInt("http.proxy.port", 8080);
+    this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
+    this.useProxy = (proxyHost != null && proxyHost.length() > 0);
+    this.timeout = conf.getInt("http.timeout", 10000);
+    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+    this.userAgent = getAgentString(conf.get("http.agent.name"),
+        conf.get("http.agent.version"), conf.get("http.agent.description"),
+        conf.get("http.agent.url"), conf.get("http.agent.email"));
+    this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
+    this.accept = conf.get("http.accept", accept);
+    // backward-compatible default setting
+    this.useHttp11 = conf.getBoolean("http.useHttp11", false);
+    this.responseTime = conf.getBoolean("http.store.responsetime", true);
+    this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
+    this.robots.setConf(conf);
+
+    // NUTCH-1941: read list of alternating agent names
+    if (conf.getBoolean("http.agent.rotate", false)) {
+      String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
+      BufferedReader br = null;
+      try {
+        Reader reader = conf.getConfResourceAsReader(agentsFile);
+        br = new BufferedReader(reader);
+        userAgentNames = new ArrayList<String>();
+        String word = "";
+        while ((word = br.readLine()) != null) {
+          if (!word.trim().isEmpty())
+            userAgentNames.add(word.trim());
+        }
+
+        if (userAgentNames.size() == 0) {
+          logger.warn("Empty list of user agents in http.agent.rotate.file {}",
+              agentsFile);
+          userAgentNames = null;
+        }
+
+      } catch (Exception e) {
+        logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
+            StringUtils.stringifyException(e));
+        userAgentNames = null;
+      } finally {
+        if (br != null) {
+          try {
+            br.close();
+          } catch (IOException e) {
+            // ignore
+          }
+        }
+      }
+      if (userAgentNames == null) {
+        logger
+            .warn("Falling back to fixed user agent set via property http.agent.name");
+      }
+    }
+
+    String[] protocols = conf.getStrings("http.tls.supported.protocols",
+        "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+    String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
+        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
+        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
+        "TLS_RSA_WITH_AES_256_CBC_SHA256",
+        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
+        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
+        "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
+        "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
+        "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
+        "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
+        "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+        "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
+        "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
+        "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
+        "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
+        "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
+        "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
+        "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
+        "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
+        "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
+        "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
+        "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
+        "TLS_KRB5_WITH_DES_CBC_MD5");
+
+    tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
+    tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
+
+    logConf();
+  }
+
+  // Inherited Javadoc
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+
+    String urlString = url.toString();
+    try {
+      URL u = new URL(urlString);
+
+      long startTime = System.currentTimeMillis();
+      Response response = getResponse(u, datum, false); // make a request
+
+      if (this.responseTime) {
+        int elapsedTime = (int) (System.currentTimeMillis() - startTime);
+        datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
+      }
+
+      int code = response.getCode();
+      datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+        new Text(Integer.toString(code)));
+
+      byte[] content = response.getContent();
+      Content c = new Content(u.toString(), u.toString(),
+          (content == null ? EMPTY_CONTENT : content),
+          response.getHeader("Content-Type"), response.getHeaders(), this.conf);
+
+      if (code == 200) { // got a good response
+        return new ProtocolOutput(c); // return it
+
+      } else if (code >= 300 && code < 400) { // handle redirect
+        String location = response.getHeader("Location");
+        // some broken servers, such as MS IIS, use lowercase header name...
+        if (location == null)
+          location = response.getHeader("location");
+        if (location == null)
+          location = "";
+        u = new URL(u, location);
+        int protocolStatusCode;
+        switch (code) {
+        case 300: // multiple choices, preferred value in Location
+          protocolStatusCode = ProtocolStatus.MOVED;
+          break;
+        case 301: // moved permanently
+        case 305: // use proxy (Location is URL of proxy)
+          protocolStatusCode = ProtocolStatus.MOVED;
+          break;
+        case 302: // found (temporarily moved)
+        case 303: // see other (redirect after POST)
+        case 307: // temporary redirect
+          protocolStatusCode = ProtocolStatus.TEMP_MOVED;
+          break;
+        case 304: // not modified
+          protocolStatusCode = ProtocolStatus.NOTMODIFIED;
+          break;
+        default:
+          protocolStatusCode = ProtocolStatus.MOVED;
+        }
+        // handle this in the higher layer.
+        return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
+      } else if (code == 400) { // bad request, mark as GONE
+        if (logger.isTraceEnabled()) {
+          logger.trace("400 Bad request: " + u);
+        }
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
+      } else if (code == 401) { // requires authorization, but no valid auth
+                                // provided.
+        if (logger.isTraceEnabled()) {
+          logger.trace("401 Authentication Required");
+        }
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.ACCESS_DENIED, "Authentication required: "
+                + urlString));
+      } else if (code == 404) {
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.NOTFOUND, u));
+      } else if (code == 410) { // permanently GONE
+        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
+            "Http: " + code + " url=" + u));
+      } else {
+        return new ProtocolOutput(c, new ProtocolStatus(
+            ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
+      }
+    } catch (Throwable e) {
+      logger.error("Failed to get protocol output", e);
+      return new ProtocolOutput(null, new ProtocolStatus(e));
+    }
+  }
+
+  /*
+   * -------------------------- * </implementation:Protocol> *
+   * --------------------------
+   */
+
+  public String getProxyHost() {
+    return proxyHost;
+  }
+
+  public int getProxyPort() {
+    return proxyPort;
+  }
+
+  public boolean useProxy(URL url) {
+    if (!useProxy){
+      return false;
+    } else if (proxyException.get(url.getHost())!=null){
+      return false;
+    }
+    return useProxy;
+  }
+
+  public int getTimeout() {
+    return timeout;
+  }
+  
+  public boolean isIfModifiedSinceEnabled() {
+    return enableIfModifiedsinceHeader;
+  }
+
+  public int getMaxContent() {
+    return maxContent;
+  }
+
+  public String getUserAgent() {
+    if (userAgentNames!=null) {
+      return userAgentNames.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()-1));
+    }
+    return userAgent;
+  }
+
+  /**
+   * Value of "Accept-Language" request header sent by Nutch.
+   * 
+   * @return The value of the header "Accept-Language" header.
+   */
+  public String getAcceptLanguage() {
+    return acceptLanguage;
+  }
+
+  public String getAccept() {
+    return accept;
+  }
+
+  public boolean getUseHttp11() {
+    return useHttp11;
+  }
+
+  public Set<String> getTlsPreferredCipherSuites() {
+    return tlsPreferredCipherSuites;
+  }
+
+  public Set<String> getTlsPreferredProtocols() {
+    return tlsPreferredProtocols;
+  }
+
+  private static String getAgentString(String agentName, String agentVersion,
+      String agentDesc, String agentURL, String agentEmail) {
+
+    if ((agentName == null) || (agentName.trim().length() == 0)) {
+      // TODO : NUTCH-258
+      if (LOGGER.isErrorEnabled()) {
+        LOGGER.error("No User-Agent string set (http.agent.name)!");
+      }
+    }
+
+    StringBuffer buf = new StringBuffer();
+
+    buf.append(agentName);
+    if (agentVersion != null) {
+      buf.append("/");
+      buf.append(agentVersion);
+    }
+    if (((agentDesc != null) && (agentDesc.length() != 0))
+        || ((agentEmail != null) && (agentEmail.length() != 0))
+        || ((agentURL != null) && (agentURL.length() != 0))) {
+      buf.append(" (");
+
+      if ((agentDesc != null) && (agentDesc.length() != 0)) {
+        buf.append(agentDesc);
+        if ((agentURL != null) || (agentEmail != null))
+          buf.append("; ");
+      }
+
+      if ((agentURL != null) && (agentURL.length() != 0)) {
+        buf.append(agentURL);
+        if (agentEmail != null)
+          buf.append("; ");
+      }
+
+      if ((agentEmail != null) && (agentEmail.length() != 0))
+        buf.append(agentEmail);
+
+      buf.append(")");
+    }
+    return buf.toString();
+  }
+
+  /** Logs the effective HTTP configuration at INFO level. */
+  protected void logConf() {
+    if (logger.isInfoEnabled()) {
+      logger.info("http.proxy.host = " + proxyHost);
+      logger.info("http.proxy.port = " + proxyPort);
+      logger.info("http.proxy.exception.list = " + proxyException); // was logging useProxy
+      logger.info("http.timeout = " + timeout);
+      logger.info("http.content.limit = " + maxContent);
+      logger.info("http.agent = " + userAgent);
+      logger.info("http.accept.language = " + acceptLanguage);
+      logger.info("http.accept = " + accept);
+    }
+
+  public byte[] processGzipEncoded(byte[] compressed, URL url)
+      throws IOException {
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("uncompressing....");
+    }
+
+    // content can be empty (i.e. redirection) in which case
+    // there is nothing to unzip
+    if (compressed.length == 0)
+      return compressed;
+
+    byte[] content;
+    if (getMaxContent() >= 0) {
+      content = GZIPUtils.unzipBestEffort(compressed, getMaxContent());
+    } else {
+      content = GZIPUtils.unzipBestEffort(compressed);
+    }
+
+    if (content == null)
+      throw new IOException("unzipBestEffort returned null");
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("fetched " + compressed.length
+          + " bytes of compressed content (expanded to " + content.length
+          + " bytes) from " + url);
+    }
+    return content;
+  }
+
+  public byte[] processDeflateEncoded(byte[] compressed, URL url)
+      throws IOException {
+
+    // content can be empty (i.e. redirection) in which case
+    // there is nothing to deflate
+    if (compressed.length == 0)
+      return compressed;
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("inflating....");
+    }
+
+    byte[] content;
+    if (getMaxContent() >= 0) {
+      content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
+    } else {
+      content = DeflateUtils.inflateBestEffort(compressed);
+    }
+
+    if (content == null)
+      throw new IOException("inflateBestEffort returned null");
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("fetched " + compressed.length
+          + " bytes of compressed content (expanded to " + content.length
+          + " bytes) from " + url);
+    }
+    return content;
+  }
+
+  protected static void main(HttpBase http, String[] args) throws Exception {
+    boolean verbose = false;
+    String url = null;
+
+    String usage = "Usage: Http [-verbose] [-timeout N] url";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    for (int i = 0; i < args.length; i++) { // parse command line
+      if (args[i].equals("-timeout")) { // found -timeout option
+        http.timeout = Integer.parseInt(args[++i]) * 1000;
+      } else if (args[i].equals("-verbose")) { // found -verbose option
+        verbose = true;
+      } else if (i != args.length - 1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else
+        // root is required parameter
+        url = args[i];
+    }
+
+    // if (verbose) {
+    // LOGGER.setLevel(Level.FINE);
+    // }
+
+    ProtocolOutput out = http
+        .getProtocolOutput(new Text(url), new CrawlDatum());
+    Content content = out.getContent();
+
+    System.out.println("Status: " + out.getStatus());
+    if (content != null) {
+      System.out.println("Content Type: " + content.getContentType());
+      System.out.println("Content Length: "
+          + content.getMetadata().get(Response.CONTENT_LENGTH));
+      System.out.println("Content:");
+      String text = new String(content.getContent());
+      System.out.println(text);
+    }
+  }
+
+  protected abstract Response getResponse(URL url, CrawlDatum datum,
+      boolean followRedirects) throws ProtocolException, IOException;
+
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+    return robots.getRobotRulesSet(this, url);
+  }
+  
+  /**
+   * Transforming a String[] into a HashMap for faster searching
+   * @param input String[]
+   * @return a new HashMap
+   */
+  private HashMap arrayToMap(String[]input){
+    if (input==null ||input.length==0) {
+      return new HashMap();
+    }
+    HashMap hm=new HashMap();
+    for (int i=0;i<input.length;i++){
+      if (!"".equals(input[i].trim())){
+        hm.put(input[i],input[i]);
+      }
+    }
+    return hm;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
new file mode 100644
index 0000000..ff7ef5b
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpException.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.http.api;
+
+// Nutch imports
+import org.apache.nutch.protocol.ProtocolException;
+
+public class HttpException extends ProtocolException {
+
+  public HttpException() {
+    super();
+  }
+
+  public HttpException(String message) {
+    super(message);
+  }
+
+  public HttpException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  public HttpException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
new file mode 100644
index 0000000..185ca15
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.RobotRulesParser;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to HTTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Http protocol
+ * specific implementation for obtaining the robots file.
+ */
+public class HttpRobotRulesParser extends RobotRulesParser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HttpRobotRulesParser.class);
+  protected boolean allowForbidden = false;
+
+  HttpRobotRulesParser() {
+  }
+
+  public HttpRobotRulesParser(Configuration conf) {
+    setConf(conf);
+  }
+
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    allowForbidden = conf.getBoolean("http.robots.403.allow", true);
+  }
+
+  /** Compose unique key to store and access robot rules in cache for given URL */
+  protected static String getCacheKey(URL url) {
+    String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+                                                       // case
+    String host = url.getHost().toLowerCase(); // normalize to lower case
+    int port = url.getPort();
+    if (port == -1) {
+      port = url.getDefaultPort();
+    }
+    /*
+     * Robot rules apply only to host, protocol, and port where robots.txt is
+     * hosted (cf. NUTCH-1752). Consequently the cache key is protocol:host:port.
+     */
+    String cacheKey = protocol + ":" + host + ":" + port;
+    return cacheKey;
+  }
+
+  /**
+   * Get the rules from robots.txt which applies for the given {@code url}.
+   * Robot rules are cached for a unique combination of host, protocol, and
+   * port. If no rules are found in the cache, a HTTP request is send to fetch
+   * {{protocol://host:port/robots.txt}}. The robots.txt is then parsed and the
+   * rules are cached to avoid re-fetching and re-parsing it again.
+   * 
+   * @param http
+   *          The {@link Protocol} object
+   * @param url
+   *          URL robots.txt applies to
+   * 
+   * @return {@link BaseRobotRules} holding the rules from robots.txt
+   */
+  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+
+    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
+      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+    }
+
+    String cacheKey = getCacheKey(url);
+    BaseRobotRules robotRules = CACHE.get(cacheKey);
+
+    if (robotRules != null) {
+      return robotRules; // cached rule
+    } else if (LOG.isTraceEnabled()) {
+      LOG.trace("cache miss " + url);
+    }
+
+    boolean cacheRule = true;
+    URL redir = null;
+
+    if (isWhiteListed(url)) {
+      // check in advance whether a host is whitelisted
+      // (we do not need to fetch robots.txt)
+      robotRules = EMPTY_RULES;
+      LOG.info("Whitelisted host found for: {}", url);
+      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
+          url.getHost());
+
+    } else {
+      try {
+        Response response = ((HttpBase) http).getResponse(new URL(url,
+            "/robots.txt"), new CrawlDatum(), true);
+        // try one level of redirection ?
+        if (response.getCode() == 301 || response.getCode() == 302) {
+          String redirection = response.getHeader("Location");
+          if (redirection == null) {
+            // some versions of MS IIS are known to mangle this header
+            redirection = response.getHeader("location");
+          }
+          if (redirection != null) {
+            if (!redirection.startsWith("http")) {
+              // RFC says it should be absolute, but apparently it isn't
+              redir = new URL(url, redirection);
+            } else {
+              redir = new URL(redirection);
+            }
+
+            response = ((HttpBase) http).getResponse(redir, new CrawlDatum(),
+                true);
+          }
+        }
+
+        if (response.getCode() == 200) // found rules: parse them
+          robotRules = parseRules(url.toString(), response.getContent(),
+              response.getHeader("Content-Type"), agentNames);
+
+        else if ((response.getCode() == 403) && (!allowForbidden))
+          robotRules = FORBID_ALL_RULES; // use forbid all
+        else if (response.getCode() >= 500) {
+          cacheRule = false; // try again later to fetch robots.txt
+          robotRules = EMPTY_RULES;
+        } else
+          robotRules = EMPTY_RULES; // use default rules
+      } catch (Throwable t) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+        }
+        cacheRule = false; // try again later to fetch robots.txt
+        robotRules = EMPTY_RULES;
+      }
+    }
+
+    if (cacheRule) {
+      CACHE.put(cacheKey, robotRules); // cache rules for host
+      if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
+        // cache also for the redirected host
+        CACHE.put(getCacheKey(redir), robotRules);
+      }
+    }
+
+    return robotRules;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
new file mode 100644
index 0000000..972bb3c
--- /dev/null
+++ b/nutch-plugins/lib-http/src/main/java/org/apache/nutch/protocol/http/api/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http},
+{@link org.apache.nutch.protocol.httpclient httpclient})</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
new file mode 100644
index 0000000..23e4ef6
--- /dev/null
+++ b/nutch-plugins/lib-http/src/test/java/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * JUnit test case which tests 1. that robots filtering is performed correctly
+ * as per the agent name 2. that crawl delay is extracted correctly from the
+ * robots file
+ * 
+ */
+public class TestRobotRulesParser {
+
+  private static final String CONTENT_TYPE = "text/plain";
+  private static final String SINGLE_AGENT = "Agent1";
+  // Agent1's section must be honored even when it is only one of several configured agents
+  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
+  // not named in ROBOTS_STRING; must fall back to the "User-Agent: *" section
+  private static final String UNKNOWN_AGENT = "AgentABC";
+  private static final String CR = "\r";
+
+  private static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR
+      + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c"
+      + CR
+      + "Crawl-delay: 10"
+      + CR // set crawl delay for Agent1 as 10 sec
+      + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh"
+      + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20"
+      + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no
+                                                                          // crawl
+                                                                          // delay
+                                                                          // for
+                                                                          // other
+                                                                          // agents
+
+  // URLs checked against ROBOTS_STRING; expected outcomes for Agent1 are in
+  // the parallel RESULTS array below.
+  private static final String[] TEST_PATHS = new String[] {
+      "http://example.com/a", "http://example.com/a/bloh/foo.html",
+      "http://example.com/b", "http://example.com/c",
+      "http://example.com/b/a/index.html",
+      "http://example.com/foo/bar/baz.html" };
+
+  private static final boolean[] RESULTS = new boolean[] { false, // /a
+      false, // /a/bloh/foo.html
+      true, // /b
+      true, // /c
+      false, // /b/a/index.html
+      true // /foo/bar/baz.html
+  };
+
+  private HttpRobotRulesParser parser;
+  private BaseRobotRules rules;
+
+  public TestRobotRulesParser() {
+    parser = new HttpRobotRulesParser();
+  }
+
+  /**
+   * Test that the robots rules are interpreted correctly by the robots rules
+   * parser.
+   */
+  @Test
+  public void testRobotsAgent() {
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, SINGLE_AGENT);
+
+    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+      Assert.assertTrue(
+          "testing on agent (" + SINGLE_AGENT + "), and " + "path "
+              + TEST_PATHS[counter] + " got "
+              + rules.isAllowed(TEST_PATHS[counter]),
+          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+    }
+
+    // same expectations: Agent1's rules apply when it appears anywhere in the
+    // comma-separated agent list
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, MULTIPLE_AGENTS);
+
+    for (int counter = 0; counter < TEST_PATHS.length; counter++) {
+      Assert.assertTrue(
+          "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path "
+              + TEST_PATHS[counter] + " got "
+              + rules.isAllowed(TEST_PATHS[counter]),
+          rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
+    }
+  }
+
+  /**
+   * Test that the crawl delay is extracted from the robots file for respective
+   * agent. If its not specified for a given agent, default value must be
+   * returned.
+   */
+  @Test
+  public void testCrawlDelay() {
+    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
+    // returned by the parser
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, SINGLE_AGENT);
+    Assert.assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
+        (rules.getCrawlDelay() == 10000));
+
+    // for UNKNOWN_AGENT, the default crawl delay must be returned.
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+        CONTENT_TYPE, UNKNOWN_AGENT);
+    Assert.assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
+        (rules.getCrawlDelay() == Long.MIN_VALUE));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/build.xml b/nutch-plugins/lib-nekohtml/build.xml
new file mode 100644
index 0000000..4bca1af
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/build.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-nekohtml" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+  <!--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! -->
+  <target name="compile" depends="init, resolve-default"/>
+
+  <target name="jar" depends="compile"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/ivy.xml b/nutch-plugins/lib-nekohtml/ivy.xml
new file mode 100644
index 0000000..ed70b80
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/plugin.xml b/nutch-plugins/lib-nekohtml/plugin.xml
new file mode 100644
index 0000000..513c9a7
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/plugin.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! NekoHTML is a simple HTML scanner and tag balancer that enables 
+ ! application programmers to parse HTML documents and access the 
+ ! information using standard XML interfaces.
+ ! (http://sourceforge.net/projects/nekohtml/)
+ ! 
+ ! License : https://nekohtml.svn.sourceforge.net/svnroot/nekohtml/trunk/LICENSE.txt
+ !-->
+<plugin
+   id="lib-nekohtml"
+   name="CyberNeko HTML Parser"
+   version="1.9.19"
+   provider-name="net.sourceforge.nekohtml">
+
+   <runtime>
+     <library name="nekohtml-1.9.19.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-nekohtml/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-nekohtml/pom.xml b/nutch-plugins/lib-nekohtml/pom.xml
new file mode 100644
index 0000000..df544bb
--- /dev/null
+++ b/nutch-plugins/lib-nekohtml/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-nekohtml</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-nekohtml</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>net.sourceforge.nekohtml</groupId>
+            <artifactId>nekohtml</artifactId>
+            <version>1.9.19</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/build.xml b/nutch-plugins/lib-regex-filter/build.xml
new file mode 100644
index 0000000..9702ca2
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-regex-filter" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/ivy.xml b/nutch-plugins/lib-regex-filter/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/plugin.xml b/nutch-plugins/lib-regex-filter/plugin.xml
new file mode 100644
index 0000000..42de8f1
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/plugin.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A common framework for RegExp based URL filters
+ !-->
+<plugin
+   id="lib-regex-filter"
+   name="Regex URL Filter Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-regex-filter.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/pom.xml b/nutch-plugins/lib-regex-filter/pom.xml
new file mode 100644
index 0000000..1074ad7
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/pom.xml
@@ -0,0 +1,54 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-regex-filter</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-regex-filter</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <version>2.6</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
new file mode 100644
index 0000000..e408586
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexRule.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+/**
+ * A generic regular expression rule.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexRule {
+
+  /** True if URLs matching this rule are accepted, false if they are rejected. */
+  private final boolean sign;
+  
+  /** Optional host or domain this rule is restricted to; null means the rule applies to every URL. */
+  private final String hostOrDomain;
+  
+  /** The regular expression text of this rule. */
+  private final String regex;
+
+  /**
+   * Constructs a new regular expression rule.
+   * 
+   * @param sign
+   *          specifies if this rule must filter-in or filter-out. A
+   *          <code>true</code> value means that any url matching this rule must
+   *          be accepted, a <code>false</code> value means that any url
+   *          matching this rule must be rejected.
+   * @param regex
+   *          is the regular expression used for matching (see
+   *          {@link #match(String)} method).
+   */
+  protected RegexRule(boolean sign, String regex) {
+    this(sign, regex, null);
+  }
+  
+  /**
+   * Constructs a new regular expression rule.
+   * 
+   * @param sign
+   *          specifies if this rule must filter-in or filter-out. A
+   *          <code>true</code> value means that any url matching this rule must
+   *          be accepted, a <code>false</code> value means that any url
+   *          matching this rule must be rejected.
+   * @param regex
+   *          is the regular expression used for matching (see
+   *          {@link #match(String)} method).
+   * @param hostOrDomain
+   *          the host or domain to which this regex belongs
+   */
+  protected RegexRule(boolean sign, String regex, String hostOrDomain) {
+    this.sign = sign;
+    this.hostOrDomain = hostOrDomain;
+    this.regex = regex;
+  }
+
+  /**
+   * Return if this rule is used for filtering-in or out.
+   * 
+   * @return <code>true</code> if any url matching this rule must be accepted,
+   *         otherwise <code>false</code>.
+   */
+  protected boolean accept() {
+    return sign;
+  }
+
+  /**
+   * Returns the host or domain to which this rule is restricted.
+   *
+   * @return host or domain this regex rule belongs to, or null if unrestricted
+   */
+  protected String hostOrDomain() { return hostOrDomain; }
+  
+  /**
+   * Returns this rule's regular expression.
+   *
+   * @return this regex
+   */
+  protected String regex() { return regex; }
+
+  /**
+   * Checks if a url matches this rule.
+   * 
+   * @param url
+   *          is the url to check.
+   * @return <code>true</code> if the specified url matches this rule, otherwise
+   *         <code>false</code>.
+   */
+  protected abstract boolean match(String url);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
new file mode 100644
index 0000000..f5cc081
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -0,0 +1,315 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.File;
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.ArrayList;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.net.*;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
+ * expressions.
+ * 
+ * <p>
+ * The regular expressions rules are expressed in a file. The file of rules is
+ * determined for each implementation using the
+ * {@link #getRulesReader(Configuration conf)} method.
+ * </p>
+ * 
+ * <p>
+ * The format of this file is made of many rules (one per line):<br/>
+ * <code>
+ * [+-]&lt;regex&gt;
+ * </code><br/>
+ * where plus (<code>+</code>)means go ahead and index it and minus (
+ * <code>-</code>)means no.
+ * </p>
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexURLFilterBase implements URLFilter {
+
+  /** My logger */
+  private final static Logger LOG = LoggerFactory
+      .getLogger(RegexURLFilterBase.class);
+
+  /** An array of applicable rules */
+  private List<RegexRule> rules;
+
+  /** The current configuration */
+  private Configuration conf;
+
+  /**
+   * Constructs a new empty RegexURLFilterBase
+   */
+  public RegexURLFilterBase() {
+  }
+
+  /**
+   * Constructs a new RegexURLFilter and init it with a file of rules.
+   * 
+   * @param filename
+   *          is the name of rules file.
+   */
+  public RegexURLFilterBase(File filename) throws IOException,
+      IllegalArgumentException {
+    this(new FileReader(filename));
+  }
+
+  /**
+   * Constructs a new RegexURLFilter and inits it with a list of rules.
+   * 
+   * @param rules
+   *          string with a list of rules, one rule per line
+   * @throws IOException
+   * @throws IllegalArgumentException
+   */
+  public RegexURLFilterBase(String rules) throws IOException,
+      IllegalArgumentException {
+    this(new StringReader(rules));
+  }
+
+  /**
+   * Constructs a new RegexURLFilter and init it with a Reader of rules.
+   * 
+   * @param reader
+   *          is a reader of rules.
+   */
+  protected RegexURLFilterBase(Reader reader) throws IOException,
+      IllegalArgumentException {
+    rules = readRules(reader);
+  }
+
+  /**
+   * Creates a new {@link RegexRule}.
+   * 
+   * @param sign
+   *          of the regular expression. A <code>true</code> value means that
+   *          any URL matching this rule must be included, whereas a
+   *          <code>false</code> value means that any URL matching this rule
+   *          must be excluded.
+   * @param regex
+   *          is the regular expression associated to this rule.
+   */
+  protected abstract RegexRule createRule(boolean sign, String regex);
+  
+  /**
+   * Creates a new {@link RegexRule}.
+   * @param sign
+   *        of the regular expression.
+   *        A <code>true</code> value means that any URL matching this rule
+   *        must be included, whereas a <code>false</code>
+   *        value means that any URL matching this rule must be excluded.
+   * @param regex
+   *        is the regular expression associated to this rule.
+   * @param hostOrDomain
+   *        the host or domain to which this regex belongs
+   */
+  protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);
+
+  /**
+   * Returns the name of the file of rules to use for a particular
+   * implementation.
+   * 
+   * @param conf
+   *          is the current configuration.
+   * @return the name of the resource containing the rules to use.
+   */
+  protected abstract Reader getRulesReader(Configuration conf)
+      throws IOException;
+
+  /*
+   * -------------------------- * <implementation:URLFilter> *
+   * --------------------------
+   */
+
+  // Inherited Javadoc
+  public String filter(String url) {
+    String host = URLUtil.getHost(url);
+    String domain = null;
+    
+    try {
+      domain = URLUtil.getDomainName(url);
+    } catch (MalformedURLException e) {
+      // should not happen for well-formed URLs; if it does, domain stays null
+      // and domain-scoped rules simply will not match this URL
+    }
+    
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("URL belongs to host " + host + " and domain " + domain);
+    }
+
+    for (RegexRule rule : rules) {
+      // Skip rules that are scoped to a host or domain different from the URL's
+      if (rule.hostOrDomain() != null &&
+            !rule.hostOrDomain().equals(host) &&
+            !rule.hostOrDomain().equals(domain)) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
+        }
+
+        continue;
+      }
+    
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
+      }
+
+      // First matching rule decides: accept the URL or filter it out.
+      if (rule.match(url)) {
+        return rule.accept() ? url : null;
+      }
+    }
+    ; // NOTE(review): stray empty statement; harmless, left untouched
+    return null;
+  }
+
+  /*
+   * --------------------------- * </implementation:URLFilter> *
+   * ---------------------------
+   */
+
+  /*
+   * ----------------------------- * <implementation:Configurable> *
+   * -----------------------------
+   */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    Reader reader = null;
+    try {
+      reader = getRulesReader(conf);
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
+    }
+    try {
+      rules = readRules(reader);
+    } catch (IOException e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /*
+   * ------------------------------ * </implementation:Configurable> *
+   * ------------------------------
+   */
+
+  /**
+   * Read the specified file of rules.
+   * <p>
+   * Lines starting with '+' or '-' define accept/reject rules; a line starting
+   * with '&gt;' scopes all subsequent rules to the given host or domain, and a
+   * line starting with '&lt;' clears that scope. Blank lines and lines starting
+   * with ' ' or '#' are skipped.
+   * </p>
+   * 
+   * @param reader
+   *          is a reader of regular expressions rules.
+   * @return the corresponding {@link RegexRule rules}.
+   */
+  private List<RegexRule> readRules(Reader reader) throws IOException,
+      IllegalArgumentException {
+
+    BufferedReader in = new BufferedReader(reader);
+    List<RegexRule> rules = new ArrayList<RegexRule>();
+    String line;
+    String hostOrDomain = null;
+    
+    while ((line = in.readLine()) != null) {
+      if (line.length() == 0) {
+        continue;
+      }
+      char first = line.charAt(0);
+      boolean sign = false;
+      switch (first) {
+      case '+':
+        sign = true;
+        break;
+      case '-':
+        sign = false;
+        break;
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
+        continue;
+      case '>':
+        hostOrDomain = line.substring(1).trim();
+        continue;
+      case '<':
+        hostOrDomain = null;
+        continue;
+      default:
+        throw new IOException("Invalid first character: " + line);
+      }
+
+      String regex = line.substring(1);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain);
+      }
+      RegexRule rule = createRule(sign, regex, hostOrDomain);
+      rules.add(rule);
+    }
+    return rules;
+  }
+
+  /**
+   * Filter the standard input using a RegexURLFilterBase.
+   * 
+   * @param filter
+   *          is the RegexURLFilterBase to use for filtering the standard input.
+   * @param args
+   *          some optional parameters (not used).
+   */
+  public static void main(RegexURLFilterBase filter, String args[])
+      throws IOException, IllegalArgumentException {
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
+        System.out.print("+");
+        System.out.println(out);
+      } else {
+        System.out.print("-");
+        System.out.println(line);
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
new file mode 100644
index 0000000..b849353
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/src/main/java/org/apache/nutch/urlfilter/api/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Generic {@link org.apache.nutch.net.URLFilter URL filter} library,
+ * abstracting away from regular expression implementations.
+ */
+package org.apache.nutch.urlfilter.api;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
new file mode 100644
index 0000000..0b58231
--- /dev/null
+++ b/nutch-plugins/lib-regex-filter/src/test/java/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.api;
+
+// JDK imports
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.net.URLFilter;
+
+/**
+ * JUnit based test of class <code>RegexURLFilterBase</code>.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public abstract class RegexURLFilterBaseTest {
+
+  /** My logger */
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(RegexURLFilterBaseTest.class);
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  protected abstract URLFilter getURLFilter(Reader rules);
+
+  protected void bench(int loops, String file) {
+    try {
+      bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  protected void bench(int loops, Reader rules, Reader urls) {
+    long start = System.currentTimeMillis();
+    try {
+      URLFilter filter = getURLFilter(rules);
+      FilteredURL[] expected = readURLFile(urls);
+      for (int i = 0; i < loops; i++) {
+        test(filter, expected);
+      }
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+    LOG.info("bench time (" + loops + ") "
+        + (System.currentTimeMillis() - start) + "ms");
+  }
+
+  protected void test(String file) {
+    try {
+      test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  protected void test(Reader rules, Reader urls) {
+    try {
+      test(getURLFilter(rules), readURLFile(urls));
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  protected void test(URLFilter filter, FilteredURL[] expected) {
+    for (int i = 0; i < expected.length; i++) {
+      String result = filter.filter(expected[i].url);
+      if (result != null) {
+        Assert.assertTrue(expected[i].url, expected[i].sign);
+      } else {
+        Assert.assertFalse(expected[i].url, expected[i].sign);
+      }
+    }
+  }
+
+  private static FilteredURL[] readURLFile(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    List<FilteredURL> list = new ArrayList<FilteredURL>();
+    String line;
+    while ((line = in.readLine()) != null) {
+      if (line.length() != 0) {
+        list.add(new FilteredURL(line));
+      }
+    }
+    return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
+  }
+
+  private static class FilteredURL {
+
+    boolean sign;
+    String url;
+
+    FilteredURL(String line) {
+      switch (line.charAt(0)) {
+      case '+':
+        sign = true;
+        break;
+      case '-':
+        sign = false;
+        break;
+      default:
+        // Simply ignore...
+      }
+      url = line.substring(1);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/build-ivy.xml b/nutch-plugins/lib-selenium/build-ivy.xml
new file mode 100644
index 0000000..3abcf6d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/build.xml b/nutch-plugins/lib-selenium/build.xml
new file mode 100644
index 0000000..7c6d98d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">    
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+  </path>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt b/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
new file mode 100644
index 0000000..1892a62
--- /dev/null
+++ b/nutch-plugins/lib-selenium/howto_upgrade_selenium.txt
@@ -0,0 +1,15 @@
+1. Upgrade the various driver version dependencies in src/plugin/lib-selenium/ivy.xml
+
+2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
+
+   To get a list of dependencies and their versions execute:
+    $ ant -f ./build-ivy.xml
+    $ ls lib | sed 's/^/     <library name="/g' | sed 's/$/">\n       <export name="*"\/>\n     <\/library>/g'
+
+   Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
+
+   N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can install GNU Sed as follows
+
+   $ brew install gnu-sed --with-default-names
+
+   You can then restart your terminal and the Regex + Sed command should work just fine!

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/ivy.xml b/nutch-plugins/lib-selenium/ivy.xml
new file mode 100644
index 0000000..701b725
--- /dev/null
+++ b/nutch-plugins/lib-selenium/ivy.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/plugin.xml b/nutch-plugins/lib-selenium/plugin.xml
new file mode 100644
index 0000000..a86d665
--- /dev/null
+++ b/nutch-plugins/lib-selenium/plugin.xml
@@ -0,0 +1,175 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! A common framework for http protocol implementations
+ !-->
+<plugin
+   id="lib-selenium"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-selenium.jar">
+        <export name="*"/>
+     </library>
+     <!-- all classes from dependent libraries are exported -->
+     <library name="cglib-nodep-2.1_3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-codec-1.10.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-collections-3.2.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-exec-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-io-2.4.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-jxpath-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-lang3-3.4.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-logging-1.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="cssparser-0.9.16.jar">
+       <export name="*"/>
+     </library>
+     <library name="gson-2.3.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="guava-18.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="htmlunit-2.18.jar">
+       <export name="*"/>
+     </library>
+     <library name="htmlunit-core-js-2.17.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpclient-4.5.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpcore-4.4.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpmime-4.5.jar">
+       <export name="*"/>
+     </library>
+     <library name="ini4j-0.5.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-io-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="jetty-util-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="jna-4.1.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="jna-platform-4.1.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="nekohtml-1.9.22.jar">
+       <export name="*"/>
+     </library>
+     <library name="netty-3.5.2.Final.jar">
+       <export name="*"/>
+     </library>
+     <library name="operadriver-1.5.jar">
+       <export name="*"/>
+     </library>
+     <library name="operalaunchers-1.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="phantomjsdriver-1.2.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="protobuf-java-2.4.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="sac-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-api-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-chrome-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-edge-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-firefox-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-htmlunit-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-ie-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-java-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-leg-rc-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-remote-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-safari-driver-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="selenium-support-2.48.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="serializer-2.7.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="webbit-0.4.14.jar">
+       <export name="*"/>
+     </library>
+     <library name="websocket-api-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="websocket-client-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="websocket-common-9.2.12.v20150709.jar">
+       <export name="*"/>
+     </library>
+     <library name="xalan-2.7.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="xercesImpl-2.11.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="xml-apis-1.4.01.jar">
+       <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/pom.xml b/nutch-plugins/lib-selenium/pom.xml
new file mode 100644
index 0000000..fed912d
--- /dev/null
+++ b/nutch-plugins/lib-selenium/pom.xml
@@ -0,0 +1,49 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-selenium</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-selenium</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.seleniumhq.selenium</groupId> <artifactId>selenium-java</artifactId> <version>2.48.2</version>
+        </dependency>
+        <dependency>
+            <groupId>com.opera</groupId> <artifactId>operadriver</artifactId> <version>1.5</version>
+        </dependency>
+        <dependency>
+            <groupId>com.codeborne</groupId> <artifactId>phantomjsdriver</artifactId> <version>1.2.1</version>
+        </dependency>
+    </dependencies>
+
+</project>


[51/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/0bf453e5
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/0bf453e5
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/0bf453e5

Branch: refs/heads/NUTCH-2292
Commit: 0bf453e5754967541a0798585dbe115630679c5f
Parents: 5943d11
Author: Thamme Gowda <th...@apache.org>
Authored: Sat Jul 16 12:47:08 2016 -0700
Committer: Thamme Gowda <th...@apache.org>
Committed: Sat Jul 16 12:47:08 2016 -0700

----------------------------------------------------------------------
 .gitignore                                      |   6 +
 bin/crawl                                       | 281 ++++++
 bin/nutch                                       | 324 +++++++
 nutch-core/pom.xml                              | 522 +++++++++++
 .../nutch/crawl/AbstractFetchSchedule.java      | 227 +++++
 .../nutch/crawl/AdaptiveFetchSchedule.java      | 203 +++++
 .../java/org/apache/nutch/crawl/CrawlDatum.java | 572 ++++++++++++
 .../java/org/apache/nutch/crawl/CrawlDb.java    | 349 ++++++++
 .../org/apache/nutch/crawl/CrawlDbFilter.java   | 111 +++
 .../org/apache/nutch/crawl/CrawlDbMerger.java   | 216 +++++
 .../org/apache/nutch/crawl/CrawlDbReader.java   | 887 +++++++++++++++++++
 .../org/apache/nutch/crawl/CrawlDbReducer.java  | 339 +++++++
 .../apache/nutch/crawl/DeduplicationJob.java    | 389 ++++++++
 .../nutch/crawl/DefaultFetchSchedule.java       |  45 +
 .../org/apache/nutch/crawl/FetchSchedule.java   | 208 +++++
 .../nutch/crawl/FetchScheduleFactory.java       |  53 ++
 .../java/org/apache/nutch/crawl/Generator.java  | 859 ++++++++++++++++++
 .../java/org/apache/nutch/crawl/Injector.java   | 510 +++++++++++
 .../java/org/apache/nutch/crawl/Inlink.java     |  83 ++
 .../java/org/apache/nutch/crawl/Inlinks.java    | 110 +++
 .../java/org/apache/nutch/crawl/LinkDb.java     | 428 +++++++++
 .../org/apache/nutch/crawl/LinkDbFilter.java    | 128 +++
 .../org/apache/nutch/crawl/LinkDbMerger.java    | 204 +++++
 .../org/apache/nutch/crawl/LinkDbReader.java    | 203 +++++
 .../org/apache/nutch/crawl/MD5Signature.java    |  39 +
 .../nutch/crawl/MimeAdaptiveFetchSchedule.java  | 236 +++++
 .../org/apache/nutch/crawl/NutchWritable.java   |  66 ++
 .../java/org/apache/nutch/crawl/Signature.java  |  37 +
 .../apache/nutch/crawl/SignatureComparator.java |  57 ++
 .../apache/nutch/crawl/SignatureFactory.java    |  62 ++
 .../apache/nutch/crawl/TextMD5Signature.java    |  42 +
 .../nutch/crawl/TextProfileSignature.java       | 199 +++++
 .../org/apache/nutch/crawl/URLPartitioner.java  |  96 ++
 .../java/org/apache/nutch/crawl/package.html    |   5 +
 .../org/apache/nutch/fetcher/FetchItem.java     | 118 +++
 .../apache/nutch/fetcher/FetchItemQueue.java    | 139 +++
 .../apache/nutch/fetcher/FetchItemQueues.java   | 212 +++++
 .../org/apache/nutch/fetcher/FetchNode.java     |  59 ++
 .../org/apache/nutch/fetcher/FetchNodeDb.java   |  49 +
 .../java/org/apache/nutch/fetcher/Fetcher.java  | 600 +++++++++++++
 .../nutch/fetcher/FetcherOutputFormat.java      | 123 +++
 .../org/apache/nutch/fetcher/FetcherThread.java | 768 ++++++++++++++++
 .../org/apache/nutch/fetcher/QueueFeeder.java   | 104 +++
 .../java/org/apache/nutch/fetcher/package.html  |   5 +
 .../java/org/apache/nutch/hostdb/HostDatum.java | 324 +++++++
 .../org/apache/nutch/hostdb/ReadHostDb.java     | 240 +++++
 .../org/apache/nutch/hostdb/ResolverThread.java | 121 +++
 .../org/apache/nutch/hostdb/UpdateHostDb.java   | 259 ++++++
 .../apache/nutch/hostdb/UpdateHostDbMapper.java | 239 +++++
 .../nutch/hostdb/UpdateHostDbReducer.java       | 427 +++++++++
 .../org/apache/nutch/indexer/CleaningJob.java   | 210 +++++
 .../org/apache/nutch/indexer/IndexWriter.java   |  47 +
 .../org/apache/nutch/indexer/IndexWriters.java  | 145 +++
 .../apache/nutch/indexer/IndexerMapReduce.java  | 422 +++++++++
 .../nutch/indexer/IndexerOutputFormat.java      |  57 ++
 .../apache/nutch/indexer/IndexingException.java |  39 +
 .../apache/nutch/indexer/IndexingFilter.java    |  61 ++
 .../apache/nutch/indexer/IndexingFilters.java   |  60 ++
 .../nutch/indexer/IndexingFiltersChecker.java   | 371 ++++++++
 .../org/apache/nutch/indexer/IndexingJob.java   | 358 ++++++++
 .../org/apache/nutch/indexer/NutchDocument.java | 144 +++
 .../org/apache/nutch/indexer/NutchField.java    | 137 +++
 .../apache/nutch/indexer/NutchIndexAction.java  |  58 ++
 .../java/org/apache/nutch/indexer/package.html  |  10 +
 .../apache/nutch/metadata/CreativeCommons.java  |  35 +
 .../org/apache/nutch/metadata/DublinCore.java   | 161 ++++
 .../java/org/apache/nutch/metadata/Feed.java    |  38 +
 .../org/apache/nutch/metadata/HttpHeaders.java  |  51 ++
 .../org/apache/nutch/metadata/MetaWrapper.java  | 120 +++
 .../org/apache/nutch/metadata/Metadata.java     | 280 ++++++
 .../java/org/apache/nutch/metadata/Nutch.java   |  98 ++
 .../nutch/metadata/SpellCheckedMetadata.java    | 150 ++++
 .../java/org/apache/nutch/metadata/package.html |   6 +
 .../apache/nutch/net/URLExemptionFilter.java    |  43 +
 .../apache/nutch/net/URLExemptionFilters.java   |  64 ++
 .../java/org/apache/nutch/net/URLFilter.java    |  40 +
 .../org/apache/nutch/net/URLFilterChecker.java  | 134 +++
 .../apache/nutch/net/URLFilterException.java    |  39 +
 .../java/org/apache/nutch/net/URLFilters.java   |  44 +
 .../org/apache/nutch/net/URLNormalizer.java     |  37 +
 .../apache/nutch/net/URLNormalizerChecker.java  | 117 +++
 .../org/apache/nutch/net/URLNormalizers.java    | 325 +++++++
 .../java/org/apache/nutch/net/package-info.java |  23 +
 .../nutch/net/protocols/HttpDateFormat.java     | 124 +++
 .../nutch/net/protocols/ProtocolException.java  |  47 +
 .../apache/nutch/net/protocols/Response.java    |  46 +
 .../nutch/net/protocols/package-info.java       |  23 +
 .../org/apache/nutch/parse/HTMLMetaTags.java    | 203 +++++
 .../org/apache/nutch/parse/HtmlParseFilter.java |  45 +
 .../apache/nutch/parse/HtmlParseFilters.java    |  62 ++
 .../java/org/apache/nutch/parse/Outlink.java    | 135 +++
 .../apache/nutch/parse/OutlinkExtractor.java    | 145 +++
 .../main/java/org/apache/nutch/parse/Parse.java |  38 +
 .../org/apache/nutch/parse/ParseCallable.java   |  37 +
 .../java/org/apache/nutch/parse/ParseData.java  | 255 ++++++
 .../org/apache/nutch/parse/ParseException.java  |  39 +
 .../java/org/apache/nutch/parse/ParseImpl.java  |  87 ++
 .../apache/nutch/parse/ParseOutputFormat.java   | 398 +++++++++
 .../org/apache/nutch/parse/ParsePluginList.java |  71 ++
 .../apache/nutch/parse/ParsePluginsReader.java  | 278 ++++++
 .../org/apache/nutch/parse/ParseResult.java     | 178 ++++
 .../org/apache/nutch/parse/ParseSegment.java    | 309 +++++++
 .../org/apache/nutch/parse/ParseStatus.java     | 311 +++++++
 .../java/org/apache/nutch/parse/ParseText.java  | 119 +++
 .../java/org/apache/nutch/parse/ParseUtil.java  | 181 ++++
 .../java/org/apache/nutch/parse/Parser.java     |  58 ++
 .../org/apache/nutch/parse/ParserChecker.java   | 270 ++++++
 .../org/apache/nutch/parse/ParserFactory.java   | 428 +++++++++
 .../org/apache/nutch/parse/ParserNotFound.java  |  47 +
 .../org/apache/nutch/parse/package-info.java    |  22 +
 .../plugin/CircularDependencyException.java     |  36 +
 .../java/org/apache/nutch/plugin/Extension.java | 194 ++++
 .../org/apache/nutch/plugin/ExtensionPoint.java | 123 +++
 .../plugin/MissingDependencyException.java      |  36 +
 .../java/org/apache/nutch/plugin/Pluggable.java |  31 +
 .../java/org/apache/nutch/plugin/Plugin.java    |  95 ++
 .../apache/nutch/plugin/PluginClassLoader.java  |  80 ++
 .../apache/nutch/plugin/PluginDescriptor.java   | 363 ++++++++
 .../nutch/plugin/PluginManifestParser.java      | 303 +++++++
 .../apache/nutch/plugin/PluginRepository.java   | 523 +++++++++++
 .../nutch/plugin/PluginRuntimeException.java    |  37 +
 .../java/org/apache/nutch/plugin/package.html   |  40 +
 .../java/org/apache/nutch/protocol/Content.java | 296 +++++++
 .../org/apache/nutch/protocol/Protocol.java     |  68 ++
 .../nutch/protocol/ProtocolException.java       |  39 +
 .../apache/nutch/protocol/ProtocolFactory.java  | 119 +++
 .../apache/nutch/protocol/ProtocolNotFound.java |  36 +
 .../apache/nutch/protocol/ProtocolOutput.java   |  55 ++
 .../apache/nutch/protocol/ProtocolStatus.java   | 297 +++++++
 .../apache/nutch/protocol/RobotRulesParser.java | 325 +++++++
 .../org/apache/nutch/protocol/package-info.java |  23 +
 .../nutch/scoring/AbstractScoringFilter.java    |  68 ++
 .../org/apache/nutch/scoring/ScoringFilter.java | 213 +++++
 .../nutch/scoring/ScoringFilterException.java   |  43 +
 .../apache/nutch/scoring/ScoringFilters.java    | 118 +++
 .../org/apache/nutch/scoring/package-info.java  |  22 +
 .../nutch/scoring/webgraph/LinkDatum.java       | 140 +++
 .../nutch/scoring/webgraph/LinkDumper.java      | 433 +++++++++
 .../apache/nutch/scoring/webgraph/LinkRank.java | 677 ++++++++++++++
 .../org/apache/nutch/scoring/webgraph/Node.java | 102 +++
 .../nutch/scoring/webgraph/NodeDumper.java      | 433 +++++++++
 .../nutch/scoring/webgraph/NodeReader.java      | 136 +++
 .../nutch/scoring/webgraph/ScoreUpdater.java    | 253 ++++++
 .../apache/nutch/scoring/webgraph/WebGraph.java | 783 ++++++++++++++++
 .../nutch/scoring/webgraph/package-info.java    |  24 +
 .../nutch/segment/ContentAsTextInputFormat.java | 104 +++
 .../apache/nutch/segment/SegmentChecker.java    | 136 +++
 .../nutch/segment/SegmentMergeFilter.java       |  47 +
 .../nutch/segment/SegmentMergeFilters.java      |  84 ++
 .../org/apache/nutch/segment/SegmentMerger.java | 793 +++++++++++++++++
 .../org/apache/nutch/segment/SegmentPart.java   | 113 +++
 .../org/apache/nutch/segment/SegmentReader.java | 719 +++++++++++++++
 .../org/apache/nutch/segment/package-info.java  |  23 +
 .../org/apache/nutch/service/ConfManager.java   |  39 +
 .../org/apache/nutch/service/JobManager.java    |  44 +
 .../org/apache/nutch/service/NutchReader.java   |  37 +
 .../org/apache/nutch/service/NutchServer.java   | 224 +++++
 .../nutch/service/impl/ConfManagerImpl.java     | 132 +++
 .../apache/nutch/service/impl/JobFactory.java   |  75 ++
 .../nutch/service/impl/JobManagerImpl.java      |  95 ++
 .../apache/nutch/service/impl/JobWorker.java    | 114 +++
 .../apache/nutch/service/impl/LinkReader.java   | 175 ++++
 .../apache/nutch/service/impl/NodeReader.java   | 184 ++++
 .../service/impl/NutchServerPoolExecutor.java   | 131 +++
 .../nutch/service/impl/SequenceReader.java      | 171 ++++
 .../nutch/service/model/request/DbQuery.java    |  56 ++
 .../nutch/service/model/request/JobConfig.java  |  71 ++
 .../service/model/request/NutchConfig.java      |  51 ++
 .../service/model/request/ReaderConfig.java     |  30 +
 .../nutch/service/model/request/SeedList.java   |  93 ++
 .../nutch/service/model/request/SeedUrl.java    |  89 ++
 .../service/model/response/FetchNodeDbInfo.java | 103 +++
 .../nutch/service/model/response/JobInfo.java   | 102 +++
 .../service/model/response/NutchServerInfo.java |  55 ++
 .../service/resources/AbstractResource.java     |  45 +
 .../nutch/service/resources/AdminResource.java  |  85 ++
 .../nutch/service/resources/ConfigResource.java | 137 +++
 .../nutch/service/resources/DbResource.java     | 143 +++
 .../nutch/service/resources/JobResource.java    |  99 +++
 .../nutch/service/resources/ReaderResouce.java  | 177 ++++
 .../nutch/service/resources/SeedResource.java   | 111 +++
 .../nutch/tools/AbstractCommonCrawlFormat.java  | 393 ++++++++
 .../java/org/apache/nutch/tools/Benchmark.java  | 284 ++++++
 .../apache/nutch/tools/CommonCrawlConfig.java   | 147 +++
 .../nutch/tools/CommonCrawlDataDumper.java      | 716 +++++++++++++++
 .../apache/nutch/tools/CommonCrawlFormat.java   |  87 ++
 .../nutch/tools/CommonCrawlFormatFactory.java   |  74 ++
 .../nutch/tools/CommonCrawlFormatJackson.java   | 109 +++
 .../nutch/tools/CommonCrawlFormatJettinson.java | 122 +++
 .../nutch/tools/CommonCrawlFormatSimple.java    | 174 ++++
 .../nutch/tools/CommonCrawlFormatWARC.java      | 286 ++++++
 .../java/org/apache/nutch/tools/DmozParser.java | 391 ++++++++
 .../java/org/apache/nutch/tools/FileDumper.java | 419 +++++++++
 .../org/apache/nutch/tools/FreeGenerator.java   | 214 +++++
 .../org/apache/nutch/tools/ResolveUrls.java     | 204 +++++
 .../java/org/apache/nutch/tools/WARCUtils.java  | 154 ++++
 .../apache/nutch/tools/arc/ArcInputFormat.java  |  51 ++
 .../apache/nutch/tools/arc/ArcRecordReader.java | 299 +++++++
 .../nutch/tools/arc/ArcSegmentCreator.java      | 426 +++++++++
 .../apache/nutch/tools/arc/package-info.java    |  23 +
 .../org/apache/nutch/tools/package-info.java    |  22 +
 .../apache/nutch/tools/warc/WARCExporter.java   | 333 +++++++
 .../apache/nutch/tools/warc/package-info.java   |  23 +
 .../org/apache/nutch/util/CommandRunner.java    | 291 ++++++
 .../apache/nutch/util/CrawlCompletionStats.java | 245 +++++
 .../org/apache/nutch/util/DeflateUtils.java     | 140 +++
 .../java/org/apache/nutch/util/DomUtil.java     | 104 +++
 .../org/apache/nutch/util/DumpFileUtil.java     | 147 +++
 .../org/apache/nutch/util/EncodingDetector.java | 386 ++++++++
 .../java/org/apache/nutch/util/FSUtils.java     | 106 +++
 .../java/org/apache/nutch/util/GZIPUtils.java   | 148 ++++
 .../nutch/util/GenericWritableConfigurable.java |  60 ++
 .../org/apache/nutch/util/HadoopFSUtil.java     |  72 ++
 .../java/org/apache/nutch/util/JexlUtil.java    |  76 ++
 .../java/org/apache/nutch/util/LockUtil.java    |  84 ++
 .../java/org/apache/nutch/util/MimeUtil.java    | 279 ++++++
 .../java/org/apache/nutch/util/NodeWalker.java  | 129 +++
 .../apache/nutch/util/NutchConfiguration.java   | 104 +++
 .../java/org/apache/nutch/util/NutchJob.java    |  30 +
 .../java/org/apache/nutch/util/NutchTool.java   | 109 +++
 .../java/org/apache/nutch/util/ObjectCache.java |  56 ++
 .../apache/nutch/util/PrefixStringMatcher.java  | 119 +++
 .../nutch/util/ProtocolStatusStatistics.java    | 179 ++++
 .../java/org/apache/nutch/util/StringUtil.java  | 155 ++++
 .../apache/nutch/util/SuffixStringMatcher.java  | 114 +++
 .../java/org/apache/nutch/util/TableUtil.java   | 161 ++++
 .../java/org/apache/nutch/util/TimingUtil.java  |  72 ++
 .../apache/nutch/util/TrieStringMatcher.java    | 202 +++++
 .../java/org/apache/nutch/util/URLUtil.java     | 533 +++++++++++
 .../nutch/util/domain/DomainStatistics.java     | 234 +++++
 .../apache/nutch/util/domain/DomainSuffix.java  |  79 ++
 .../nutch/util/domain/DomainSuffixes.java       |  86 ++
 .../nutch/util/domain/DomainSuffixesReader.java | 164 ++++
 .../nutch/util/domain/TopLevelDomain.java       |  67 ++
 .../org/apache/nutch/util/domain/package.html   |  14 +
 .../org/apache/nutch/util/package-info.java     |  22 +
 .../apache/nutch/webui/NutchUiApplication.java  |  75 ++
 .../nutch/webui/NutchUiApplication.properties   |  63 ++
 .../org/apache/nutch/webui/NutchUiServer.java   | 104 +++
 .../apache/nutch/webui/client/NutchClient.java  |  49 +
 .../nutch/webui/client/NutchClientFactory.java  |  52 ++
 .../nutch/webui/client/impl/CrawlingCycle.java  |  82 ++
 .../client/impl/CrawlingCycleListener.java      |  31 +
 .../webui/client/impl/NutchClientImpl.java      |  99 +++
 .../nutch/webui/client/impl/RemoteCommand.java  |  76 ++
 .../webui/client/impl/RemoteCommandBuilder.java |  64 ++
 .../client/impl/RemoteCommandExecutor.java      | 110 +++
 .../client/impl/RemoteCommandsBatchFactory.java |  97 ++
 .../webui/client/model/ConnectionStatus.java    |  21 +
 .../apache/nutch/webui/client/model/Crawl.java  | 126 +++
 .../nutch/webui/client/model/JobConfig.java     |  77 ++
 .../nutch/webui/client/model/JobInfo.java       | 104 +++
 .../nutch/webui/client/model/NutchStatus.java   |  62 ++
 .../nutch/webui/config/CustomDaoFactory.java    |  58 ++
 .../nutch/webui/config/CustomTableCreator.java  |  83 ++
 .../webui/config/NutchGuiConfiguration.java     |  33 +
 .../nutch/webui/config/SpringConfiguration.java |  91 ++
 .../apache/nutch/webui/model/NutchConfig.java   |  24 +
 .../apache/nutch/webui/model/NutchInstance.java | 118 +++
 .../org/apache/nutch/webui/model/SeedList.java  | 106 +++
 .../org/apache/nutch/webui/model/SeedUrl.java   |  96 ++
 .../nutch/webui/pages/AbstractBasePage.html     |  33 +
 .../nutch/webui/pages/AbstractBasePage.java     | 206 +++++
 .../apache/nutch/webui/pages/DashboardPage.html |  52 ++
 .../apache/nutch/webui/pages/DashboardPage.java |  65 ++
 .../apache/nutch/webui/pages/LogOutPage.java    |  21 +
 .../nutch/webui/pages/SchedulingPage.java       |  21 +
 .../apache/nutch/webui/pages/SearchPage.java    |  21 +
 .../nutch/webui/pages/StatisticsPage.java       |  21 +
 .../nutch/webui/pages/UrlsUploadPage.java       |  21 +
 .../nutch/webui/pages/UserSettingsPage.java     |  21 +
 .../webui/pages/assets/NutchUiCssReference.java |  39 +
 .../nutch/webui/pages/assets/nutch-style.css    | 149 ++++
 .../webui/pages/components/ColorEnumLabel.java  |  71 ++
 .../pages/components/ColorEnumLabelBuilder.java |  49 +
 .../pages/components/CpmIteratorAdapter.java    |  41 +
 .../nutch/webui/pages/crawls/CrawlPanel.html    |  58 ++
 .../nutch/webui/pages/crawls/CrawlPanel.java    |  98 ++
 .../nutch/webui/pages/crawls/CrawlsPage.html    |  90 ++
 .../nutch/webui/pages/crawls/CrawlsPage.java    | 139 +++
 .../webui/pages/instances/InstancePanel.html    |  46 +
 .../webui/pages/instances/InstancePanel.java    |  62 ++
 .../webui/pages/instances/InstancesPage.html    |  66 ++
 .../webui/pages/instances/InstancesPage.java    | 127 +++
 .../nutch/webui/pages/menu/VerticalMenu.html    |  48 +
 .../nutch/webui/pages/menu/VerticalMenu.java    |  27 +
 .../nutch/webui/pages/seed/SeedListsPage.html   |  75 ++
 .../nutch/webui/pages/seed/SeedListsPage.java   |  79 ++
 .../apache/nutch/webui/pages/seed/SeedPage.html |  91 ++
 .../apache/nutch/webui/pages/seed/SeedPage.java | 153 ++++
 .../webui/pages/settings/SettingsPage.html      |  43 +
 .../webui/pages/settings/SettingsPage.java      |  59 ++
 .../nutch/webui/service/CrawlService.java       |  33 +
 .../webui/service/NutchInstanceService.java     |  33 +
 .../nutch/webui/service/NutchService.java       |  31 +
 .../nutch/webui/service/SeedListService.java    |  33 +
 .../webui/service/impl/CrawlServiceImpl.java    | 132 +++
 .../service/impl/NutchInstanceServiceImpl.java  |  76 ++
 .../webui/service/impl/NutchServiceImpl.java    |  82 ++
 .../webui/service/impl/SeedListServiceImpl.java |  77 ++
 nutch-core/src/main/java/overview.html          |   9 +
 .../nutch/crawl/ContinuousCrawlTestUtil.java    | 270 ++++++
 .../org/apache/nutch/crawl/CrawlDBTestUtil.java | 179 ++++
 .../nutch/crawl/CrawlDbUpdateTestDriver.java    | 138 +++
 .../apache/nutch/crawl/CrawlDbUpdateUtil.java   | 166 ++++
 .../org/apache/nutch/crawl/DummyWritable.java   |  32 +
 .../nutch/crawl/TODOTestCrawlDbStates.java      | 171 ++++
 .../nutch/crawl/TestAdaptiveFetchSchedule.java  | 121 +++
 .../apache/nutch/crawl/TestCrawlDbFilter.java   | 148 ++++
 .../apache/nutch/crawl/TestCrawlDbMerger.java   | 163 ++++
 .../apache/nutch/crawl/TestCrawlDbStates.java   | 569 ++++++++++++
 .../org/apache/nutch/crawl/TestGenerator.java   | 373 ++++++++
 .../org/apache/nutch/crawl/TestInjector.java    | 184 ++++
 .../apache/nutch/crawl/TestLinkDbMerger.java    | 160 ++++
 .../nutch/crawl/TestSignatureFactory.java       |  35 +
 .../org/apache/nutch/fetcher/TestFetcher.java   | 210 +++++
 .../nutch/indexer/TestIndexerMapReduce.java     | 190 ++++
 .../nutch/indexer/TestIndexingFilters.java      | 113 +++
 .../org/apache/nutch/metadata/TestMetadata.java | 281 ++++++
 .../metadata/TestSpellCheckedMetadata.java      | 303 +++++++
 .../org/apache/nutch/net/TestURLFilters.java    |  44 +
 .../apache/nutch/net/TestURLNormalizers.java    |  86 ++
 .../nutch/parse/TestOutlinkExtractor.java       |  99 +++
 .../org/apache/nutch/parse/TestParseData.java   |  58 ++
 .../org/apache/nutch/parse/TestParseText.java   |  34 +
 .../apache/nutch/parse/TestParserFactory.java   | 108 +++
 .../apache/nutch/parse/parse-plugin-test.xml    |  58 ++
 .../nutch/plugin/HelloWorldExtension.java       |  36 +
 .../org/apache/nutch/plugin/ITestExtension.java |  27 +
 .../apache/nutch/plugin/SimpleTestPlugin.java   |  57 ++
 .../apache/nutch/plugin/TestPluginSystem.java   | 305 +++++++
 .../org/apache/nutch/protocol/TestContent.java  |  94 ++
 .../nutch/protocol/TestProtocolFactory.java     |  88 ++
 .../apache/nutch/segment/TestSegmentMerger.java | 131 +++
 .../segment/TestSegmentMergerCrawlDatums.java   | 427 +++++++++
 .../apache/nutch/service/TestNutchServer.java   |  65 ++
 .../org/apache/nutch/test/IntegrationTest.java  |   6 +
 .../java/org/apache/nutch/test/TestUtils.java   |  29 +
 .../nutch/tools/TestCommonCrawlDataDumper.java  | 126 +++
 .../tools/proxy/AbstractTestbedHandler.java     |  49 +
 .../apache/nutch/tools/proxy/DelayHandler.java  |  56 ++
 .../apache/nutch/tools/proxy/FakeHandler.java   | 102 +++
 .../nutch/tools/proxy/LogDebugHandler.java      |  64 ++
 .../nutch/tools/proxy/NotFoundHandler.java      |  40 +
 .../apache/nutch/tools/proxy/ProxyTestbed.java  | 156 ++++
 .../nutch/tools/proxy/SegmentHandler.java       | 255 ++++++
 .../apache/nutch/tools/proxy/package-info.java  |  22 +
 .../org/apache/nutch/util/DumpFileUtilTest.java |  68 ++
 .../apache/nutch/util/TestEncodingDetector.java |  90 ++
 .../org/apache/nutch/util/TestGZIPUtils.java    | 241 +++++
 .../org/apache/nutch/util/TestMimeUtil.java     | 135 +++
 .../org/apache/nutch/util/TestNodeWalker.java   | 107 +++
 .../nutch/util/TestPrefixStringMatcher.java     | 115 +++
 .../org/apache/nutch/util/TestStringUtil.java   |  61 ++
 .../nutch/util/TestSuffixStringMatcher.java     | 114 +++
 .../org/apache/nutch/util/TestTableUtil.java    |  75 ++
 .../java/org/apache/nutch/util/TestURLUtil.java | 281 ++++++
 .../apache/nutch/util/WritableTestUtils.java    |  55 ++
 nutch-core/src/test/resources/crawl-tests.xml   |  62 ++
 .../src/test/resources/domain-urlfilter.txt     |  22 +
 .../resources/fetch-test-site/dup_of_pagea.html |  11 +
 .../resources/fetch-test-site/exception.html    |  13 +
 .../test/resources/fetch-test-site/index.html   |  13 +
 .../fetch-test-site/nested_spider_trap.html     |  23 +
 .../test/resources/fetch-test-site/pagea.html   |  11 +
 .../test/resources/fetch-test-site/pageb.html   |  11 +
 .../test/resources/fetch-test-site/robots.txt   |   0
 nutch-core/src/test/resources/filter-all.txt    |   7 +
 nutch-core/src/test/resources/log4j.properties  |   7 +
 nutch-core/src/test/resources/nutch-site.xml    |  19 +
 .../src/test/resources/test-mime-util/test.xlsx | Bin 0 -> 3950 bytes
 .../20150309101625/content/part-00000/.data.crc | Bin 0 -> 124 bytes
 .../content/part-00000/.index.crc               | Bin 0 -> 12 bytes
 .../20150309101625/content/part-00000/data      | Bin 0 -> 14452 bytes
 .../20150309101625/content/part-00000/index     | Bin 0 -> 217 bytes
 .../crawl_fetch/part-00000/.data.crc            | Bin 0 -> 12 bytes
 .../crawl_fetch/part-00000/.index.crc           | Bin 0 -> 12 bytes
 .../20150309101625/crawl_fetch/part-00000/data  | Bin 0 -> 293 bytes
 .../20150309101625/crawl_fetch/part-00000/index | Bin 0 -> 217 bytes
 .../crawl_generate/.part-00000.crc              | Bin 0 -> 12 bytes
 .../20150309101625/crawl_generate/part-00000    | Bin 0 -> 169 bytes
 .../20150309101625/crawl_parse/.part-00000.crc  | Bin 0 -> 68 bytes
 .../20150309101625/crawl_parse/part-00000       | Bin 0 -> 7627 bytes
 .../parse_data/part-00000/.data.crc             | Bin 0 -> 24 bytes
 .../parse_data/part-00000/.index.crc            | Bin 0 -> 12 bytes
 .../20150309101625/parse_data/part-00000/data   | Bin 0 -> 1985 bytes
 .../20150309101625/parse_data/part-00000/index  | Bin 0 -> 217 bytes
 .../parse_text/part-00000/.data.crc             | Bin 0 -> 60 bytes
 .../parse_text/part-00000/.index.crc            | Bin 0 -> 12 bytes
 .../20150309101625/parse_text/part-00000/data   | Bin 0 -> 6554 bytes
 .../20150309101625/parse_text/part-00000/index  | Bin 0 -> 217 bytes
 .../20150309101656/content/part-00000/.data.crc | Bin 0 -> 3372 bytes
 .../content/part-00000/.index.crc               | Bin 0 -> 12 bytes
 .../20150309101656/content/part-00000/data      | Bin 0 -> 430250 bytes
 .../20150309101656/content/part-00000/index     | Bin 0 -> 220 bytes
 .../crawl_fetch/part-00000/.data.crc            | Bin 0 -> 104 bytes
 .../crawl_fetch/part-00000/.index.crc           | Bin 0 -> 12 bytes
 .../20150309101656/crawl_fetch/part-00000/data  | Bin 0 -> 12121 bytes
 .../20150309101656/crawl_fetch/part-00000/index | Bin 0 -> 220 bytes
 .../crawl_generate/.part-00000.crc              | Bin 0 -> 52 bytes
 .../20150309101656/crawl_generate/part-00000    | Bin 0 -> 5590 bytes
 .../20150309101656/crawl_parse/.part-00000.crc  | Bin 0 -> 1652 bytes
 .../20150309101656/crawl_parse/part-00000       | Bin 0 -> 210047 bytes
 .../parse_data/part-00000/.data.crc             | Bin 0 -> 460 bytes
 .../parse_data/part-00000/.index.crc            | Bin 0 -> 12 bytes
 .../20150309101656/parse_data/part-00000/data   | Bin 0 -> 57355 bytes
 .../20150309101656/parse_data/part-00000/index  | Bin 0 -> 220 bytes
 .../parse_text/part-00000/.data.crc             | Bin 0 -> 1260 bytes
 .../parse_text/part-00000/.index.crc            | Bin 0 -> 12 bytes
 .../20150309101656/parse_text/part-00000/data   | Bin 0 -> 159920 bytes
 .../20150309101656/parse_text/part-00000/index  | Bin 0 -> 220 bytes
 nutch-plugins/build-plugin.xml                  | 255 ++++++
 nutch-plugins/build.xml                         | 213 +++++
 nutch-plugins/creativecommons/README.txt        |   1 +
 nutch-plugins/creativecommons/build.xml         |  28 +
 .../creativecommons/conf/crawl-urlfilter.txt    |  18 +
 .../creativecommons/conf/nutch-site.xml         |  50 ++
 nutch-plugins/creativecommons/ivy.xml           |  41 +
 nutch-plugins/creativecommons/plugin.xml        |  48 +
 nutch-plugins/creativecommons/pom.xml           |  38 +
 .../creativecommons/nutch/CCIndexingFilter.java | 124 +++
 .../creativecommons/nutch/CCParseFilter.java    | 300 +++++++
 .../java/org/creativecommons/nutch/package.html |   5 +
 .../nutch/TestCCParseFilter.java                |  73 ++
 .../src/test/resources/anchor.html              |   9 +
 .../creativecommons/src/test/resources/rdf.html |  35 +
 .../creativecommons/src/test/resources/rel.html |   6 +
 nutch-plugins/feed/build.xml                    |  45 +
 nutch-plugins/feed/ivy.xml                      |  43 +
 nutch-plugins/feed/plugin.xml                   |  49 +
 nutch-plugins/feed/pom.xml                      |  45 +
 .../nutch/indexer/feed/FeedIndexingFilter.java  | 129 +++
 .../apache/nutch/indexer/feed/package-info.java |  22 +
 .../org/apache/nutch/parse/feed/FeedParser.java | 374 ++++++++
 .../apache/nutch/parse/feed/package-info.java   |  22 +
 .../apache/nutch/parse/feed/TestFeedParser.java | 124 +++
 .../feed/src/test/resources/rsstest.rss         |  36 +
 nutch-plugins/headings/build.xml                |  22 +
 nutch-plugins/headings/ivy.xml                  |  41 +
 nutch-plugins/headings/plugin.xml               |  45 +
 nutch-plugins/headings/pom.xml                  |  38 +
 .../parse/headings/HeadingsParseFilter.java     | 124 +++
 .../nutch/parse/headings/package-info.java      |  22 +
 nutch-plugins/index-anchor/build.xml            |  22 +
 nutch-plugins/index-anchor/ivy.xml              |  41 +
 nutch-plugins/index-anchor/plugin.xml           |  38 +
 nutch-plugins/index-anchor/pom.xml              |  38 +
 .../indexer/anchor/AnchorIndexingFilter.java    | 107 +++
 .../apache/nutch/indexer/anchor/package.html    |   5 +
 .../anchor/TestAnchorIndexingFilter.java        |  67 ++
 nutch-plugins/index-basic/build.xml             |  22 +
 nutch-plugins/index-basic/ivy.xml               |  41 +
 nutch-plugins/index-basic/plugin.xml            |  42 +
 nutch-plugins/index-basic/pom.xml               |  38 +
 .../indexer/basic/BasicIndexingFilter.java      | 158 ++++
 .../org/apache/nutch/indexer/basic/package.html |   5 +
 .../indexer/basic/TestBasicIndexingFilter.java  |  99 +++
 nutch-plugins/index-geoip/build-ivy.xml         |  54 ++
 nutch-plugins/index-geoip/build.xml             |  27 +
 nutch-plugins/index-geoip/ivy.xml               |  46 +
 nutch-plugins/index-geoip/plugin.xml            |  51 ++
 nutch-plugins/index-geoip/pom.xml               |  55 ++
 .../indexer/geoip/GeoIPDocumentCreator.java     | 210 +++++
 .../indexer/geoip/GeoIPIndexingFilter.java      | 241 +++++
 .../nutch/indexer/geoip/package-info.java       |  28 +
 nutch-plugins/index-links/build.xml             |  22 +
 nutch-plugins/index-links/ivy.xml               |  41 +
 nutch-plugins/index-links/plugin.xml            |  41 +
 nutch-plugins/index-links/pom.xml               |  38 +
 .../indexer/links/LinksIndexingFilter.java      | 167 ++++
 .../indexer/links/TestLinksIndexingFilter.java  | 218 +++++
 .../org/apache/nutch/parse/TestOutlinks.java    |  54 ++
 nutch-plugins/index-metadata/build.xml          |  22 +
 nutch-plugins/index-metadata/ivy.xml            |  41 +
 nutch-plugins/index-metadata/plugin.xml         |  42 +
 nutch-plugins/index-metadata/pom.xml            |  38 +
 .../nutch/indexer/metadata/MetadataIndexer.java | 104 +++
 .../nutch/indexer/metadata/package-info.java    |  23 +
 nutch-plugins/index-more/build.xml              |  22 +
 nutch-plugins/index-more/ivy.xml                |  41 +
 nutch-plugins/index-more/plugin.xml             |  42 +
 nutch-plugins/index-more/pom.xml                |  38 +
 .../nutch/indexer/more/MoreIndexingFilter.java  | 344 +++++++
 .../org/apache/nutch/indexer/more/package.html  |   6 +
 .../indexer/more/TestMoreIndexingFilter.java    | 123 +++
 nutch-plugins/index-replace/README.txt          |  95 ++
 nutch-plugins/index-replace/build.xml           |  55 ++
 nutch-plugins/index-replace/ivy.xml             |  41 +
 nutch-plugins/index-replace/plugin.xml          |  22 +
 nutch-plugins/index-replace/pom.xml             |  50 ++
 .../nutch/indexer/replace/FieldReplacer.java    | 196 ++++
 .../nutch/indexer/replace/ReplaceIndexer.java   | 330 +++++++
 .../nutch/indexer/replace/package-info.java     |  22 +
 .../nutch/indexer/replace/TestIndexReplace.java | 456 ++++++++++
 .../src/test/resources/testIndexReplace.html    |  12 +
 nutch-plugins/index-static/build.xml            |  22 +
 nutch-plugins/index-static/ivy.xml              |  41 +
 nutch-plugins/index-static/plugin.xml           |  42 +
 nutch-plugins/index-static/pom.xml              |  38 +
 .../indexer/staticfield/StaticFieldIndexer.java | 143 +++
 .../nutch/indexer/staticfield/package.html      |   5 +
 .../staticfield/TestStaticFieldIndexerTest.java | 194 ++++
 nutch-plugins/indexer-cloudsearch/README.md     |  58 ++
 nutch-plugins/indexer-cloudsearch/build.xml     |  22 +
 .../indexer-cloudsearch/createCSDomain.sh       |  22 +
 nutch-plugins/indexer-cloudsearch/ivy.xml       |  41 +
 nutch-plugins/indexer-cloudsearch/plugin.xml    |  50 ++
 nutch-plugins/indexer-cloudsearch/pom.xml       |  45 +
 .../cloudsearch/CloudSearchConstants.java       |  27 +
 .../cloudsearch/CloudSearchIndexWriter.java     | 382 ++++++++
 .../cloudsearch/CloudSearchUtils.java           |  73 ++
 nutch-plugins/indexer-dummy/build.xml           |  22 +
 nutch-plugins/indexer-dummy/ivy.xml             |  41 +
 nutch-plugins/indexer-dummy/plugin.xml          |  38 +
 nutch-plugins/indexer-dummy/pom.xml             |  38 +
 .../indexwriter/dummy/DummyIndexWriter.java     | 103 +++
 .../nutch/indexwriter/dummy/package-info.java   |  23 +
 nutch-plugins/indexer-elastic/build-ivy.xml     |  54 ++
 nutch-plugins/indexer-elastic/build.xml         |  22 +
 .../indexer-elastic/howto_upgrade_es.txt        |   6 +
 nutch-plugins/indexer-elastic/ivy.xml           |  43 +
 nutch-plugins/indexer-elastic/plugin.xml        |  71 ++
 nutch-plugins/indexer-elastic/pom.xml           |  45 +
 .../indexwriter/elastic/ElasticConstants.java   |  28 +
 .../indexwriter/elastic/ElasticIndexWriter.java | 279 ++++++
 .../nutch/indexwriter/elastic/package-info.java |  22 +
 nutch-plugins/indexer-solr/build-ivy.xml        |  54 ++
 nutch-plugins/indexer-solr/build.xml            |  22 +
 nutch-plugins/indexer-solr/ivy.xml              |  44 +
 nutch-plugins/indexer-solr/plugin.xml           |  48 +
 nutch-plugins/indexer-solr/pom.xml              |  55 ++
 .../nutch/indexwriter/solr/SolrConstants.java   |  56 ++
 .../nutch/indexwriter/solr/SolrIndexWriter.java | 277 ++++++
 .../indexwriter/solr/SolrMappingReader.java     | 147 +++
 .../nutch/indexwriter/solr/SolrUtils.java       |  97 ++
 .../nutch/indexwriter/solr/package-info.java    |  22 +
 nutch-plugins/language-identifier/build.xml     |  38 +
 nutch-plugins/language-identifier/ivy.xml       |  41 +
 nutch-plugins/language-identifier/plugin.xml    |  49 +
 nutch-plugins/language-identifier/pom.xml       |  38 +
 .../nutch/analysis/lang/HTMLLanguageParser.java | 320 +++++++
 .../analysis/lang/LanguageIndexingFilter.java   |  89 ++
 .../nutch/analysis/lang/langmappings.properties | 188 ++++
 .../org/apache/nutch/analysis/lang/package.html |   6 +
 .../analysis/lang/TestHTMLLanguageParser.java   | 149 ++++
 .../java/org/apache/nutch/analysis/lang/da.test | 108 +++
 .../java/org/apache/nutch/analysis/lang/de.test | 104 +++
 .../java/org/apache/nutch/analysis/lang/el.test | 109 +++
 .../java/org/apache/nutch/analysis/lang/en.test | 105 +++
 .../java/org/apache/nutch/analysis/lang/es.test | 107 +++
 .../java/org/apache/nutch/analysis/lang/fi.test | 106 +++
 .../java/org/apache/nutch/analysis/lang/fr.test | 105 +++
 .../java/org/apache/nutch/analysis/lang/it.test | 109 +++
 .../java/org/apache/nutch/analysis/lang/nl.test | 105 +++
 .../java/org/apache/nutch/analysis/lang/pt.test | 105 +++
 .../java/org/apache/nutch/analysis/lang/sv.test | 108 +++
 .../nutch/analysis/lang/test-referencial.txt    |  10 +
 nutch-plugins/lib-htmlunit/build-ivy.xml        |  54 ++
 nutch-plugins/lib-htmlunit/build.xml            |  28 +
 nutch-plugins/lib-htmlunit/ivy.xml              |  52 ++
 nutch-plugins/lib-htmlunit/plugin.xml           | 166 ++++
 nutch-plugins/lib-htmlunit/pom.xml              |  55 ++
 .../protocol/htmlunit/HtmlUnitWebDriver.java    | 189 ++++
 .../htmlunit/HtmlUnitWebWindowListener.java     |  53 ++
 nutch-plugins/lib-http/build.xml                |  22 +
 nutch-plugins/lib-http/ivy.xml                  |  41 +
 nutch-plugins/lib-http/plugin.xml               |  33 +
 nutch-plugins/lib-http/pom.xml                  |  38 +
 .../protocol/http/api/BlockedException.java     |  26 +
 .../nutch/protocol/http/api/HttpBase.java       | 587 ++++++++++++
 .../nutch/protocol/http/api/HttpException.java  |  40 +
 .../protocol/http/api/HttpRobotRulesParser.java | 167 ++++
 .../apache/nutch/protocol/http/api/package.html |   6 +
 .../protocol/http/api/TestRobotRulesParser.java | 123 +++
 nutch-plugins/lib-nekohtml/build.xml            |  30 +
 nutch-plugins/lib-nekohtml/ivy.xml              |  42 +
 nutch-plugins/lib-nekohtml/plugin.xml           |  38 +
 nutch-plugins/lib-nekohtml/pom.xml              |  45 +
 nutch-plugins/lib-regex-filter/build.xml        |  22 +
 nutch-plugins/lib-regex-filter/ivy.xml          |  41 +
 nutch-plugins/lib-regex-filter/plugin.xml       |  33 +
 nutch-plugins/lib-regex-filter/pom.xml          |  54 ++
 .../apache/nutch/urlfilter/api/RegexRule.java   | 102 +++
 .../nutch/urlfilter/api/RegexURLFilterBase.java | 315 +++++++
 .../nutch/urlfilter/api/package-info.java       |  23 +
 .../urlfilter/api/RegexURLFilterBaseTest.java   | 134 +++
 nutch-plugins/lib-selenium/build-ivy.xml        |  54 ++
 nutch-plugins/lib-selenium/build.xml            |  28 +
 .../lib-selenium/howto_upgrade_selenium.txt     |  15 +
 nutch-plugins/lib-selenium/ivy.xml              |  52 ++
 nutch-plugins/lib-selenium/plugin.xml           | 175 ++++
 nutch-plugins/lib-selenium/pom.xml              |  49 +
 .../nutch/protocol/selenium/HttpWebClient.java  | 236 +++++
 nutch-plugins/lib-xml/build.xml                 |  36 +
 nutch-plugins/lib-xml/ivy.xml                   |  44 +
 nutch-plugins/lib-xml/plugin.xml                |  65 ++
 nutch-plugins/lib-xml/pom.xml                   |  38 +
 nutch-plugins/microformats-reltag/build.xml     |  27 +
 nutch-plugins/microformats-reltag/ivy.xml       |  41 +
 nutch-plugins/microformats-reltag/plugin.xml    |  49 +
 nutch-plugins/microformats-reltag/pom.xml       |  38 +
 .../reltag/RelTagIndexingFilter.java            |  77 ++
 .../nutch/microformats/reltag/RelTagParser.java | 148 ++++
 .../nutch/microformats/reltag/package.html      |   8 +
 nutch-plugins/mimetype-filter/build.xml         |  28 +
 nutch-plugins/mimetype-filter/ivy.xml           |  41 +
 nutch-plugins/mimetype-filter/plugin.xml        |  37 +
 nutch-plugins/mimetype-filter/pom.xml           |  38 +
 .../indexer/filter/MimeTypeIndexingFilter.java  | 273 ++++++
 .../filter/MimeTypeIndexingFilterTest.java      | 114 +++
 .../src/test/resources/allow-images.txt         |  34 +
 .../src/test/resources/block-html.txt           |  34 +
 nutch-plugins/nutch-extensionpoints/build.xml   |  30 +
 nutch-plugins/nutch-extensionpoints/ivy.xml     |  41 +
 nutch-plugins/nutch-extensionpoints/plugin.xml  |  67 ++
 nutch-plugins/nutch-extensionpoints/pom.xml     |  38 +
 nutch-plugins/parse-ext/build.xml               |  32 +
 nutch-plugins/parse-ext/command                 |  24 +
 nutch-plugins/parse-ext/ivy.xml                 |  41 +
 nutch-plugins/parse-ext/plugin.xml              |  60 ++
 nutch-plugins/parse-ext/pom.xml                 |  38 +
 .../org/apache/nutch/parse/ext/ExtParser.java   | 183 ++++
 .../apache/nutch/parse/ext/package-info.java    |  22 +
 .../apache/nutch/parse/ext/TestExtParser.java   | 130 +++
 nutch-plugins/parse-html/build.xml              |  40 +
 nutch-plugins/parse-html/ivy.xml                |  42 +
 nutch-plugins/parse-html/plugin.xml             |  48 +
 nutch-plugins/parse-html/pom.xml                |  49 +
 .../org/apache/nutch/parse/html/DOMBuilder.java | 766 ++++++++++++++++
 .../nutch/parse/html/DOMContentUtils.java       | 400 +++++++++
 .../nutch/parse/html/HTMLMetaProcessor.java     | 214 +++++
 .../org/apache/nutch/parse/html/HtmlParser.java | 352 ++++++++
 .../parse/html/XMLCharacterRecognizer.java      | 112 +++
 .../org/apache/nutch/parse/html/package.html    |   5 +
 .../nutch/parse/html/TestDOMContentUtils.java   | 347 ++++++++
 .../apache/nutch/parse/html/TestHtmlParser.java | 122 +++
 .../parse/html/TestRobotsMetaProcessor.java     | 155 ++++
 nutch-plugins/parse-js/build.xml                |  22 +
 nutch-plugins/parse-js/ivy.xml                  |  41 +
 nutch-plugins/parse-js/plugin.xml               |  53 ++
 nutch-plugins/parse-js/pom.xml                  |  38 +
 .../apache/nutch/parse/js/JSParseFilter.java    | 301 +++++++
 .../org/apache/nutch/parse/js/package-info.java |  23 +
 nutch-plugins/parse-metatags/README.txt         |  17 +
 nutch-plugins/parse-metatags/build.xml          |  37 +
 nutch-plugins/parse-metatags/ivy.xml            |  41 +
 nutch-plugins/parse-metatags/plugin.xml         |  22 +
 nutch-plugins/parse-metatags/pom.xml            |  38 +
 .../nutch/parse/metatags/MetaTagsParser.java    | 124 +++
 .../nutch/parse/metatags/package-info.java      |  24 +
 .../nutch/parse/metatags/TestMetatagParser.java | 104 +++
 .../src/test/resources/testMetatags.html        |   9 +
 .../test/resources/testMultivalueMetatags.html  |  12 +
 nutch-plugins/parse-replace/README.txt          |  91 ++
 nutch-plugins/parse-replace/build.xml           |  37 +
 nutch-plugins/parse-replace/ivy.xml             |  41 +
 nutch-plugins/parse-replace/plugin.xml          |  22 +
 nutch-plugins/parse-replace/pom.xml             |  38 +
 .../nutch/parse/replace/ReplaceParser.java      |  74 ++
 .../nutch/parse/replace/package-info.java       |  22 +
 .../nutch/parse/replace/TestParseReplace.java   |  68 ++
 .../src/test/resources/testParseReplace.html    |  11 +
 nutch-plugins/parse-swf/build.xml               |  38 +
 nutch-plugins/parse-swf/ivy.xml                 |  41 +
 nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt |  33 +
 nutch-plugins/parse-swf/lib/javaswf.jar         | Bin 0 -> 125369 bytes
 nutch-plugins/parse-swf/plugin.xml              |  44 +
 nutch-plugins/parse-swf/pom.xml                 |  46 +
 .../org/apache/nutch/parse/swf/SWFParser.java   | 685 ++++++++++++++
 .../apache/nutch/parse/swf/package-info.java    |  22 +
 .../apache/nutch/parse/swf/TestSWFParser.java   |  94 ++
 .../parse-swf/src/test/resources/test1.swf      | Bin 0 -> 21054 bytes
 .../parse-swf/src/test/resources/test1.txt      |  60 ++
 .../parse-swf/src/test/resources/test2.swf      | Bin 0 -> 42534 bytes
 .../parse-swf/src/test/resources/test2.txt      |   5 +
 .../parse-swf/src/test/resources/test3.swf      | Bin 0 -> 51562 bytes
 .../parse-swf/src/test/resources/test3.txt      |  11 +
 nutch-plugins/parse-tika/build-ivy.xml          |  54 ++
 nutch-plugins/parse-tika/build.xml              |  55 ++
 nutch-plugins/parse-tika/howto_upgrade_tika.txt |   8 +
 nutch-plugins/parse-tika/ivy.xml                |  46 +
 nutch-plugins/parse-tika/plugin.xml             | 136 +++
 nutch-plugins/parse-tika/pom.xml                |  54 ++
 .../tika/BoilerpipeExtractorRepository.java     |  62 ++
 .../org/apache/nutch/parse/tika/DOMBuilder.java | 794 +++++++++++++++++
 .../nutch/parse/tika/DOMContentUtils.java       | 402 +++++++++
 .../nutch/parse/tika/HTMLMetaProcessor.java     | 214 +++++
 .../org/apache/nutch/parse/tika/TikaParser.java | 286 ++++++
 .../parse/tika/XMLCharacterRecognizer.java      | 112 +++
 .../apache/nutch/parse/tika/package-info.java   |  23 +
 .../apache/nutch/tika/TestDOMContentUtils.java  | 337 +++++++
 .../org/apache/nutch/tika/TestFeedParser.java   | 121 +++
 .../apache/nutch/tika/TestImageMetadata.java    |  67 ++
 .../org/apache/nutch/tika/TestMSWordParser.java |  92 ++
 .../org/apache/nutch/tika/TestOOParser.java     | 107 +++
 .../org/apache/nutch/tika/TestPdfParser.java    |  73 ++
 .../org/apache/nutch/tika/TestRTFParser.java    |  81 ++
 .../nutch/tika/TestRobotsMetaProcessor.java     | 156 ++++
 .../parse-tika/src/test/resources/encrypted.pdf | Bin 0 -> 3431 bytes
 .../parse-tika/src/test/resources/nutch.html    | 519 +++++++++++
 .../src/test/resources/nutch_logo_tm.gif        | Bin 0 -> 2747 bytes
 .../parse-tika/src/test/resources/ootest.odt    | Bin 0 -> 20753 bytes
 .../parse-tika/src/test/resources/ootest.sxw    | Bin 0 -> 20125 bytes
 .../parse-tika/src/test/resources/ootest.txt    |  30 +
 .../parse-tika/src/test/resources/pdftest.pdf   | 157 ++++
 .../parse-tika/src/test/resources/rsstest.rss   |  37 +
 .../parse-tika/src/test/resources/test.rtf      |  17 +
 .../parse-tika/src/test/resources/word97.doc    | Bin 0 -> 8192 bytes
 nutch-plugins/parse-zip/build.xml               |  38 +
 nutch-plugins/parse-zip/ivy.xml                 |  41 +
 nutch-plugins/parse-zip/plugin.xml              |  46 +
 nutch-plugins/parse-zip/pom.xml                 |  38 +
 .../org/apache/nutch/parse/zip/ZipParser.java   | 144 +++
 .../nutch/parse/zip/ZipTextExtractor.java       | 120 +++
 .../apache/nutch/parse/zip/package-info.java    |  22 +
 .../apache/nutch/parse/zip/TestZipParser.java   |  71 ++
 .../parse-zip/src/test/resources/test.zip       | Bin 0 -> 182 bytes
 .../parsefilter-naivebayes/build-ivy.xml        |  54 ++
 nutch-plugins/parsefilter-naivebayes/build.xml  |  22 +
 nutch-plugins/parsefilter-naivebayes/ivy.xml    |  49 +
 nutch-plugins/parsefilter-naivebayes/plugin.xml |  56 ++
 nutch-plugins/parsefilter-naivebayes/pom.xml    |  38 +
 .../nutch/parsefilter/naivebayes/Classify.java  | 120 +++
 .../naivebayes/NaiveBayesParseFilter.java       | 197 ++++
 .../nutch/parsefilter/naivebayes/Train.java     | 148 ++++
 .../parsefilter/naivebayes/package-info.java    |  28 +
 nutch-plugins/parsefilter-regex/build.xml       |  27 +
 nutch-plugins/parsefilter-regex/ivy.xml         |  37 +
 nutch-plugins/parsefilter-regex/plugin.xml      |  42 +
 nutch-plugins/parsefilter-regex/pom.xml         |  38 +
 .../parsefilter/regex/RegexParseFilter.java     | 199 +++++
 .../nutch/parsefilter/regex/package-info.java   |  23 +
 .../parsefilter/regex/TestRegexParseFilter.java |  77 ++
 .../src/test/resources/regex-parsefilter.txt    |  10 +
 nutch-plugins/plugin.dtd                        | 206 +++++
 nutch-plugins/plugin/pom.xml                    |  38 +
 nutch-plugins/pom.xml                           | 164 ++++
 nutch-plugins/protocol-file/build.xml           |  29 +
 nutch-plugins/protocol-file/ivy.xml             |  41 +
 nutch-plugins/protocol-file/plugin.xml          |  46 +
 nutch-plugins/protocol-file/pom.xml             |  38 +
 .../org/apache/nutch/protocol/file/File.java    | 228 +++++
 .../apache/nutch/protocol/file/FileError.java   |  36 +
 .../nutch/protocol/file/FileException.java      |  40 +
 .../nutch/protocol/file/FileResponse.java       | 317 +++++++
 .../org/apache/nutch/protocol/file/package.html |   5 +
 .../nutch/protocol/file/TestProtocolFile.java   |  99 +++
 .../src/test/resources/testprotocolfile.txt     |   1 +
 .../resources/testprotocolfile_(encoded).txt    |   1 +
 nutch-plugins/protocol-ftp/build.xml            |  22 +
 nutch-plugins/protocol-ftp/ivy.xml              |  42 +
 nutch-plugins/protocol-ftp/plugin.xml           |  46 +
 nutch-plugins/protocol-ftp/pom.xml              |  38 +
 .../org/apache/nutch/protocol/ftp/Client.java   | 595 +++++++++++++
 .../java/org/apache/nutch/protocol/ftp/Ftp.java | 267 ++++++
 .../org/apache/nutch/protocol/ftp/FtpError.java |  36 +
 .../apache/nutch/protocol/ftp/FtpException.java |  46 +
 .../ftp/FtpExceptionBadSystResponse.java        |  29 +
 .../FtpExceptionCanNotHaveDataConnection.java   |  29 +
 ...ExceptionControlClosedByForcedDataClose.java |  30 +
 .../ftp/FtpExceptionUnknownForcedDataClose.java |  30 +
 .../apache/nutch/protocol/ftp/FtpResponse.java  | 521 +++++++++++
 .../nutch/protocol/ftp/FtpRobotRulesParser.java | 121 +++
 .../protocol/ftp/PrintCommandListener.java      |  71 ++
 .../org/apache/nutch/protocol/ftp/package.html  |   5 +
 nutch-plugins/protocol-htmlunit/build.xml       |  37 +
 nutch-plugins/protocol-htmlunit/ivy.xml         |  38 +
 nutch-plugins/protocol-htmlunit/plugin.xml      |  51 ++
 nutch-plugins/protocol-htmlunit/pom.xml         |  51 ++
 .../apache/nutch/protocol/htmlunit/Http.java    |  63 ++
 .../nutch/protocol/htmlunit/HttpResponse.java   | 573 ++++++++++++
 .../apache/nutch/protocol/htmlunit/package.html |  21 +
 nutch-plugins/protocol-http/build.xml           |  50 ++
 nutch-plugins/protocol-http/ivy.xml             |  41 +
 nutch-plugins/protocol-http/jsp/basic-http.jsp  |  44 +
 nutch-plugins/protocol-http/jsp/brokenpage.jsp  |  47 +
 nutch-plugins/protocol-http/jsp/redirect301.jsp |  49 +
 nutch-plugins/protocol-http/jsp/redirect302.jsp |  49 +
 nutch-plugins/protocol-http/plugin.xml          |  51 ++
 nutch-plugins/protocol-http/pom.xml             |  57 ++
 .../org/apache/nutch/protocol/http/Http.java    |  73 ++
 .../nutch/protocol/http/HttpResponse.java       | 558 ++++++++++++
 .../org/apache/nutch/protocol/http/package.html |   5 +
 .../src/test/conf/nutch-site-test.xml           |  52 ++
 .../nutch/protocol/http/TestProtocolHttp.java   | 140 +++
 nutch-plugins/protocol-httpclient/build.xml     |  45 +
 nutch-plugins/protocol-httpclient/ivy.xml       |  42 +
 nutch-plugins/protocol-httpclient/jsp/basic.jsp |  74 ++
 .../protocol-httpclient/jsp/cookies.jsp         |  63 ++
 .../protocol-httpclient/jsp/digest.jsp          |  68 ++
 .../protocol-httpclient/jsp/noauth.jsp          |  36 +
 nutch-plugins/protocol-httpclient/jsp/ntlm.jsp  |  89 ++
 nutch-plugins/protocol-httpclient/plugin.xml    |  58 ++
 nutch-plugins/protocol-httpclient/pom.xml       |  62 ++
 .../DummySSLProtocolSocketFactory.java          | 163 ++++
 .../httpclient/DummyX509TrustManager.java       |  92 ++
 .../apache/nutch/protocol/httpclient/Http.java  | 572 ++++++++++++
 .../protocol/httpclient/HttpAuthentication.java |  45 +
 .../httpclient/HttpAuthenticationException.java |  71 ++
 .../httpclient/HttpAuthenticationFactory.java   |  98 ++
 .../httpclient/HttpBasicAuthentication.java     | 199 +++++
 .../httpclient/HttpFormAuthConfigurer.java      | 106 +++
 .../httpclient/HttpFormAuthentication.java      | 223 +++++
 .../nutch/protocol/httpclient/HttpResponse.java | 216 +++++
 .../nutch/protocol/httpclient/package.html      |   9 +
 .../src/test/conf/httpclient-auth-test.xml      |  58 ++
 .../src/test/conf/nutch-site-test.xml           |  52 ++
 .../httpclient/TestProtocolHttpClient.java      | 217 +++++
 .../protocol-interactiveselenium/README.md      |  38 +
 .../protocol-interactiveselenium/build-ivy.xml  |  54 ++
 .../protocol-interactiveselenium/build.xml      |  37 +
 .../protocol-interactiveselenium/ivy.xml        |  42 +
 .../protocol-interactiveselenium/plugin.xml     |  47 +
 .../protocol-interactiveselenium/pom.xml        |  50 ++
 .../protocol/interactiveselenium/Http.java      |  59 ++
 .../interactiveselenium/HttpResponse.java       | 399 +++++++++
 .../DefalultMultiInteractionHandler.java        |  53 ++
 .../DefaultClickAllAjaxLinksHandler.java        |  88 ++
 .../handlers/DefaultHandler.java                |  30 +
 .../handlers/InteractiveSeleniumHandler.java    |  25 +
 .../protocol/interactiveselenium/package.html   |   5 +
 nutch-plugins/protocol-selenium/README.md       | 208 +++++
 nutch-plugins/protocol-selenium/build-ivy.xml   |  54 ++
 nutch-plugins/protocol-selenium/build.xml       |  36 +
 nutch-plugins/protocol-selenium/ivy.xml         |  42 +
 nutch-plugins/protocol-selenium/plugin.xml      |  47 +
 nutch-plugins/protocol-selenium/pom.xml         |  50 ++
 .../apache/nutch/protocol/selenium/Http.java    |  59 ++
 .../nutch/protocol/selenium/HttpResponse.java   | 360 ++++++++
 .../apache/nutch/protocol/selenium/package.html |   5 +
 nutch-plugins/scoring-depth/build.xml           |   6 +
 nutch-plugins/scoring-depth/ivy.xml             |  41 +
 nutch-plugins/scoring-depth/plugin.xml          |  24 +
 nutch-plugins/scoring-depth/pom.xml             |  38 +
 .../nutch/scoring/depth/DepthScoringFilter.java | 207 +++++
 .../nutch/scoring/depth/package-info.java       |  23 +
 nutch-plugins/scoring-link/build.xml            |  27 +
 nutch-plugins/scoring-link/ivy.xml              |  41 +
 nutch-plugins/scoring-link/plugin.xml           |  39 +
 nutch-plugins/scoring-link/pom.xml              |  38 +
 .../scoring/link/LinkAnalysisScoringFilter.java |  95 ++
 .../apache/nutch/scoring/link/package-info.java |  23 +
 nutch-plugins/scoring-opic/build.xml            |  27 +
 nutch-plugins/scoring-opic/ivy.xml              |  41 +
 nutch-plugins/scoring-opic/plugin.xml           |  39 +
 nutch-plugins/scoring-opic/pom.xml              |  38 +
 .../nutch/scoring/opic/OPICScoringFilter.java   | 173 ++++
 .../apache/nutch/scoring/opic/package-info.java |  23 +
 nutch-plugins/scoring-similarity/build-ivy.xml  |  54 ++
 nutch-plugins/scoring-similarity/build.xml      |  27 +
 nutch-plugins/scoring-similarity/ivy.xml        |  42 +
 nutch-plugins/scoring-similarity/plugin.xml     |  45 +
 nutch-plugins/scoring-similarity/pom.xml        |  45 +
 .../scoring/similarity/SimilarityModel.java     |  38 +
 .../similarity/SimilarityScoringFilter.java     |  70 ++
 .../similarity/cosine/CosineSimilarity.java     |  84 ++
 .../scoring/similarity/cosine/DocVector.java    |  57 ++
 .../nutch/scoring/similarity/cosine/Model.java  | 190 ++++
 .../scoring/similarity/cosine/package-info.java |   7 +
 .../similarity/util/LuceneAnalyzerUtil.java     |  93 ++
 .../similarity/util/LuceneTokenizer.java        | 166 ++++
 .../scoring/similarity/util/package-info.java   |  24 +
 nutch-plugins/subcollection/README.txt          |  10 +
 nutch-plugins/subcollection/build.xml           |  22 +
 nutch-plugins/subcollection/ivy.xml             |  41 +
 nutch-plugins/subcollection/plugin.xml          |  41 +
 nutch-plugins/subcollection/pom.xml             |  38 +
 .../nutch/collection/CollectionManager.java     | 240 +++++
 .../apache/nutch/collection/Subcollection.java  | 259 ++++++
 .../org/apache/nutch/collection/package.html    |  36 +
 .../SubcollectionIndexingFilter.java            | 101 +++
 .../indexer/subcollection/package-info.java     |  25 +
 .../nutch/collection/TestSubcollection.java     | 112 +++
 nutch-plugins/tld/build.xml                     |  22 +
 nutch-plugins/tld/ivy.xml                       |  41 +
 nutch-plugins/tld/plugin.xml                    |  51 ++
 nutch-plugins/tld/pom.xml                       |  38 +
 .../nutch/indexer/tld/TLDIndexingFilter.java    |  69 ++
 .../org/apache/nutch/indexer/tld/package.html   |   5 +
 .../nutch/scoring/tld/TLDScoringFilter.java     | 114 +++
 .../org/apache/nutch/scoring/tld/package.html   |   5 +
 nutch-plugins/urlfilter-automaton/build.xml     |  51 ++
 nutch-plugins/urlfilter-automaton/ivy.xml       |  42 +
 nutch-plugins/urlfilter-automaton/plugin.xml    |  43 +
 nutch-plugins/urlfilter-automaton/pom.xml       |  58 ++
 .../urlfilter/automaton/AutomatonURLFilter.java | 116 +++
 .../nutch/urlfilter/automaton/package.html      |   9 +
 .../automaton/TestAutomatonURLFilter.java       |  56 ++
 .../src/test/resources/Benchmarks.rules         |  26 +
 .../src/test/resources/Benchmarks.urls          | 297 +++++++
 .../src/test/resources/IntranetCrawling.rules   |  24 +
 .../src/test/resources/IntranetCrawling.urls    |   8 +
 .../src/test/resources/WholeWebCrawling.rules   |  19 +
 .../src/test/resources/WholeWebCrawling.urls    |  11 +
 nutch-plugins/urlfilter-domain/build.xml        |  28 +
 nutch-plugins/urlfilter-domain/ivy.xml          |  41 +
 nutch-plugins/urlfilter-domain/plugin.xml       |  43 +
 nutch-plugins/urlfilter-domain/pom.xml          |  38 +
 .../nutch/urlfilter/domain/DomainURLFilter.java | 212 +++++
 .../nutch/urlfilter/domain/package-info.java    |  25 +
 .../urlfilter/domain/TestDomainURLFilter.java   |  67 ++
 .../src/test/resources/hosts.txt                |   5 +
 .../urlfilter-domainblacklist/build.xml         |  28 +
 nutch-plugins/urlfilter-domainblacklist/ivy.xml |  41 +
 .../urlfilter-domainblacklist/plugin.xml        |  43 +
 nutch-plugins/urlfilter-domainblacklist/pom.xml |  38 +
 .../DomainBlacklistURLFilter.java               | 210 +++++
 .../urlfilter/domainblacklist/package-info.java |  24 +
 .../TestDomainBlacklistURLFilter.java           |  49 +
 .../src/test/resources/hosts.txt                |   5 +
 nutch-plugins/urlfilter-ignoreexempt/README.md  |  43 +
 nutch-plugins/urlfilter-ignoreexempt/build.xml  |  55 ++
 nutch-plugins/urlfilter-ignoreexempt/ivy.xml    |  41 +
 nutch-plugins/urlfilter-ignoreexempt/plugin.xml |  45 +
 nutch-plugins/urlfilter-ignoreexempt/pom.xml    |  45 +
 .../ignoreexempt/ExemptionUrlFilter.java        | 101 +++
 .../urlfilter/ignoreexempt/package-info.java    |  24 +
 nutch-plugins/urlfilter-prefix/build.xml        |  22 +
 nutch-plugins/urlfilter-prefix/ivy.xml          |  41 +
 nutch-plugins/urlfilter-prefix/plugin.xml       |  47 +
 nutch-plugins/urlfilter-prefix/pom.xml          |  38 +
 .../nutch/urlfilter/prefix/PrefixURLFilter.java | 178 ++++
 .../apache/nutch/urlfilter/prefix/package.html  |   5 +
 .../urlfilter/prefix/TestPrefixURLFilter.java   |  79 ++
 nutch-plugins/urlfilter-regex/build.xml         |  51 ++
 nutch-plugins/urlfilter-regex/ivy.xml           |  41 +
 nutch-plugins/urlfilter-regex/plugin.xml        |  48 +
 nutch-plugins/urlfilter-regex/pom.xml           |  53 ++
 .../nutch/urlfilter/regex/RegexURLFilter.java   | 111 +++
 .../apache/nutch/urlfilter/regex/package.html   |   5 +
 .../urlfilter/regex/TestRegexURLFilter.java     |  61 ++
 .../src/test/resources/Benchmarks.rules         |  26 +
 .../src/test/resources/Benchmarks.urls          | 297 +++++++
 .../src/test/resources/IntranetCrawling.rules   |  27 +
 .../src/test/resources/IntranetCrawling.urls    |   8 +
 .../src/test/resources/WholeWebCrawling.rules   |  22 +
 .../src/test/resources/WholeWebCrawling.urls    |  11 +
 .../src/test/resources/nutch1838.rules          |  12 +
 .../src/test/resources/nutch1838.urls           |   3 +
 nutch-plugins/urlfilter-suffix/build.xml        |  22 +
 nutch-plugins/urlfilter-suffix/ivy.xml          |  41 +
 nutch-plugins/urlfilter-suffix/plugin.xml       |  47 +
 nutch-plugins/urlfilter-suffix/pom.xml          |  38 +
 .../nutch/urlfilter/suffix/SuffixURLFilter.java | 331 +++++++
 .../nutch/urlfilter/suffix/package-info.java    |  23 +
 .../urlfilter/suffix/TestSuffixURLFilter.java   | 123 +++
 nutch-plugins/urlfilter-validator/build.xml     |  22 +
 nutch-plugins/urlfilter-validator/ivy.xml       |  41 +
 nutch-plugins/urlfilter-validator/plugin.xml    |  41 +
 nutch-plugins/urlfilter-validator/pom.xml       |  38 +
 .../nutch/urlfilter/validator/UrlValidator.java | 386 ++++++++
 .../nutch/urlfilter/validator/package.html      |   9 +
 .../urlfilter/validator/TestUrlValidator.java   |  79 ++
 nutch-plugins/urlmeta/build.xml                 |  22 +
 nutch-plugins/urlmeta/ivy.xml                   |  41 +
 nutch-plugins/urlmeta/plugin.xml                |  47 +
 nutch-plugins/urlmeta/pom.xml                   |  38 +
 .../indexer/urlmeta/URLMetaIndexingFilter.java  | 118 +++
 .../apache/nutch/indexer/urlmeta/package.html   |  12 +
 .../scoring/urlmeta/URLMetaScoringFilter.java   | 175 ++++
 .../apache/nutch/scoring/urlmeta/package.html   |  11 +
 nutch-plugins/urlnormalizer-ajax/build.xml      |  22 +
 nutch-plugins/urlnormalizer-ajax/ivy.xml        |  41 +
 nutch-plugins/urlnormalizer-ajax/plugin.xml     |  41 +
 nutch-plugins/urlnormalizer-ajax/pom.xml        |  38 +
 .../urlnormalizer/ajax/AjaxURLNormalizer.java   | 236 +++++
 .../ajax/TestAjaxURLNormalizer.java             |  67 ++
 nutch-plugins/urlnormalizer-basic/build.xml     |  22 +
 nutch-plugins/urlnormalizer-basic/ivy.xml       |  41 +
 nutch-plugins/urlnormalizer-basic/plugin.xml    |  41 +
 nutch-plugins/urlnormalizer-basic/pom.xml       |  38 +
 .../urlnormalizer/basic/BasicURLNormalizer.java | 290 ++++++
 .../net/urlnormalizer/basic/package-info.java   |  23 +
 .../basic/TestBasicURLNormalizer.java           | 175 ++++
 nutch-plugins/urlnormalizer-host/build.xml      |  27 +
 nutch-plugins/urlnormalizer-host/ivy.xml        |  41 +
 nutch-plugins/urlnormalizer-host/plugin.xml     |  43 +
 nutch-plugins/urlnormalizer-host/pom.xml        |  38 +
 .../urlnormalizer/host/HostURLNormalizer.java   | 198 +++++
 .../net/urlnormalizer/host/package-info.java    |  23 +
 .../host/TestHostURLNormalizer.java             |  57 ++
 .../src/test/resources/hosts.txt                |   8 +
 nutch-plugins/urlnormalizer-pass/build.xml      |  22 +
 nutch-plugins/urlnormalizer-pass/ivy.xml        |  41 +
 nutch-plugins/urlnormalizer-pass/plugin.xml     |  41 +
 nutch-plugins/urlnormalizer-pass/pom.xml        |  38 +
 .../urlnormalizer/pass/PassURLNormalizer.java   |  49 +
 .../net/urlnormalizer/pass/package-info.java    |  23 +
 .../pass/TestPassURLNormalizer.java             |  45 +
 nutch-plugins/urlnormalizer-protocol/build.xml  |  27 +
 nutch-plugins/urlnormalizer-protocol/ivy.xml    |  41 +
 nutch-plugins/urlnormalizer-protocol/plugin.xml |  43 +
 nutch-plugins/urlnormalizer-protocol/pom.xml    |  38 +
 .../protocol/ProtocolURLNormalizer.java         | 190 ++++
 .../protocol/TestProtocolURLNormalizer.java     |  55 ++
 .../src/test/resources/protocols.txt            |   7 +
 .../urlnormalizer-querystring/build.xml         |  22 +
 nutch-plugins/urlnormalizer-querystring/ivy.xml |  41 +
 .../urlnormalizer-querystring/plugin.xml        |  42 +
 nutch-plugins/urlnormalizer-querystring/pom.xml |  38 +
 .../querystring/QuerystringURLNormalizer.java   |  91 ++
 .../urlnormalizer/querystring/package-info.java |  23 +
 .../TestQuerystringURLNormalizer.java           |  49 +
 nutch-plugins/urlnormalizer-regex/build.xml     |  34 +
 nutch-plugins/urlnormalizer-regex/ivy.xml       |  41 +
 nutch-plugins/urlnormalizer-regex/plugin.xml    |  41 +
 nutch-plugins/urlnormalizer-regex/pom.xml       |  38 +
 .../urlnormalizer/regex/RegexURLNormalizer.java | 324 +++++++
 .../net/urlnormalizer/regex/package-info.java   |  23 +
 .../regex/TestRegexURLNormalizer.java           | 186 ++++
 .../test/resources/regex-normalize-default.test |  84 ++
 .../test/resources/regex-normalize-default.xml  |  66 ++
 .../test/resources/regex-normalize-scope1.test  |   8 +
 .../test/resources/regex-normalize-scope1.xml   |  21 +
 nutch-plugins/urlnormalizer-slash/build.xml     |  27 +
 nutch-plugins/urlnormalizer-slash/ivy.xml       |  41 +
 nutch-plugins/urlnormalizer-slash/plugin.xml    |  43 +
 nutch-plugins/urlnormalizer-slash/pom.xml       |  38 +
 .../urlnormalizer/slash/SlashURLNormalizer.java | 224 +++++
 .../slash/TestSlashURLNormalizer.java           |  73 ++
 .../src/test/resources/slashes.txt              |   7 +
 pom.xml                                         | 157 ++++
 src/bin/crawl                                   | 281 ------
 src/bin/nutch                                   | 324 -------
 .../nutch/crawl/AbstractFetchSchedule.java      | 227 -----
 .../nutch/crawl/AdaptiveFetchSchedule.java      | 203 -----
 src/java/org/apache/nutch/crawl/CrawlDatum.java | 572 ------------
 src/java/org/apache/nutch/crawl/CrawlDb.java    | 349 --------
 .../org/apache/nutch/crawl/CrawlDbFilter.java   | 111 ---
 .../org/apache/nutch/crawl/CrawlDbMerger.java   | 216 -----
 .../org/apache/nutch/crawl/CrawlDbReader.java   | 887 -------------------
 .../org/apache/nutch/crawl/CrawlDbReducer.java  | 339 -------
 .../apache/nutch/crawl/DeduplicationJob.java    | 389 --------
 .../nutch/crawl/DefaultFetchSchedule.java       |  45 -
 .../org/apache/nutch/crawl/FetchSchedule.java   | 208 -----
 .../nutch/crawl/FetchScheduleFactory.java       |  53 --
 src/java/org/apache/nutch/crawl/Generator.java  | 859 ------------------
 src/java/org/apache/nutch/crawl/Injector.java   | 510 -----------
 src/java/org/apache/nutch/crawl/Inlink.java     |  83 --
 src/java/org/apache/nutch/crawl/Inlinks.java    | 110 ---
 src/java/org/apache/nutch/crawl/LinkDb.java     | 428 ---------
 .../org/apache/nutch/crawl/LinkDbFilter.java    | 128 ---
 .../org/apache/nutch/crawl/LinkDbMerger.java    | 204 -----
 .../org/apache/nutch/crawl/LinkDbReader.java    | 203 -----
 .../org/apache/nutch/crawl/MD5Signature.java    |  39 -
 .../nutch/crawl/MimeAdaptiveFetchSchedule.java  | 236 -----
 .../org/apache/nutch/crawl/NutchWritable.java   |  66 --
 src/java/org/apache/nutch/crawl/Signature.java  |  37 -
 .../apache/nutch/crawl/SignatureComparator.java |  57 --
 .../apache/nutch/crawl/SignatureFactory.java    |  62 --
 .../apache/nutch/crawl/TextMD5Signature.java    |  42 -
 .../nutch/crawl/TextProfileSignature.java       | 199 -----
 .../org/apache/nutch/crawl/URLPartitioner.java  |  96 --
 src/java/org/apache/nutch/crawl/package.html    |   5 -
 .../org/apache/nutch/fetcher/FetchItem.java     | 118 ---
 .../apache/nutch/fetcher/FetchItemQueue.java    | 139 ---
 .../apache/nutch/fetcher/FetchItemQueues.java   | 212 -----
 .../org/apache/nutch/fetcher/FetchNode.java     |  59 --
 .../org/apache/nutch/fetcher/FetchNodeDb.java   |  49 -
 src/java/org/apache/nutch/fetcher/Fetcher.java  | 600 -------------
 .../nutch/fetcher/FetcherOutputFormat.java      | 123 ---
 .../org/apache/nutch/fetcher/FetcherThread.java | 768 ----------------
 .../org/apache/nutch/fetcher/QueueFeeder.java   | 104 ---
 src/java/org/apache/nutch/fetcher/package.html  |   5 -
 src/java/org/apache/nutch/hostdb/HostDatum.java | 324 -------
 .../org/apache/nutch/hostdb/ReadHostDb.java     | 240 -----
 .../org/apache/nutch/hostdb/ResolverThread.java | 121 ---
 .../org/apache/nutch/hostdb/UpdateHostDb.java   | 259 ------
 .../apache/nutch/hostdb/UpdateHostDbMapper.java | 239 -----
 .../nutch/hostdb/UpdateHostDbReducer.java       | 427 ---------
 .../org/apache/nutch/indexer/CleaningJob.java   | 210 -----
 .../org/apache/nutch/indexer/IndexWriter.java   |  47 -
 .../org/apache/nutch/indexer/IndexWriters.java  | 145 ---
 .../apache/nutch/indexer/IndexerMapReduce.java  | 422 ---------
 .../nutch/indexer/IndexerOutputFormat.java      |  57 --
 .../apache/nutch/indexer/IndexingException.java |  39 -
 .../apache/nutch/indexer/IndexingFilter.java    |  61 --
 .../apache/nutch/indexer/IndexingFilters.java   |  60 --
 .../nutch/indexer/IndexingFiltersChecker.java   | 371 --------
 .../org/apache/nutch/indexer/IndexingJob.java   | 358 --------
 .../org/apache/nutch/indexer/NutchDocument.java | 144 ---
 .../org/apache/nutch/indexer/NutchField.java    | 137 ---
 .../apache/nutch/indexer/NutchIndexAction.java  |  58 --
 src/java/org/apache/nutch/indexer/package.html  |  10 -
 .../apache/nutch/metadata/CreativeCommons.java  |  35 -
 .../org/apache/nutch/metadata/DublinCore.java   | 161 ----
 src/java/org/apache/nutch/metadata/Feed.java    |  38 -
 .../org/apache/nutch/metadata/HttpHeaders.java  |  51 --
 .../org/apache/nutch/metadata/MetaWrapper.java  | 120 ---
 .../org/apache/nutch/metadata/Metadata.java     | 280 ------
 src/java/org/apache/nutch/metadata/Nutch.java   |  98 --
 .../nutch/metadata/SpellCheckedMetadata.java    | 150 ----
 src/java/org/apache/nutch/metadata/package.html |   6 -
 .../apache/nutch/net/URLExemptionFilter.java    |  43 -
 .../apache/nutch/net/URLExemptionFilters.java   |  64 --
 src/java/org/apache/nutch/net/URLFilter.java    |  40 -
 .../org/apache/nutch/net/URLFilterChecker.java  | 134 ---
 .../apache/nutch/net/URLFilterException.java    |  39 -
 src/java/org/apache/nutch/net/URLFilters.java   |  44 -
 .../org/apache/nutch/net/URLNormalizer.java     |  37 -
 .../apache/nutch/net/URLNormalizerChecker.java  | 117 ---
 .../org/apache/nutch/net/URLNormalizers.java    | 325 -------
 src/java/org/apache/nutch/net/package-info.java |  23 -
 .../nutch/net/protocols/HttpDateFormat.java     | 124 ---
 .../nutch/net/protocols/ProtocolException.java  |  47 -
 .../apache/nutch/net/protocols/Response.java    |  46 -
 .../nutch/net/protocols/package-info.java       |  23 -
 .../org/apache/nutch/parse/HTMLMetaTags.java    | 203 -----
 .../org/apache/nutch/parse/HtmlParseFilter.java |  45 -
 .../apache/nutch/parse/HtmlParseFilters.java    |  62 --
 src/java/org/apache/nutch/parse/Outlink.java    | 135 ---
 .../apache/nutch/parse/OutlinkExtractor.java    | 145 ---
 src/java/org/apache/nutch/parse/Parse.java      |  38 -
 .../org/apache/nutch/parse/ParseCallable.java   |  37 -
 src/java/org/apache/nutch/parse/ParseData.java  | 255 ------
 .../org/apache/nutch/parse/ParseException.java  |  39 -
 src/java/org/apache/nutch/parse/ParseImpl.java  |  87 --
 .../apache/nutch/parse/ParseOutputFormat.java   | 398 ---------
 .../org/apache/nutch/parse/ParsePluginList.java |  71 --
 .../apache/nutch/parse/ParsePluginsReader.java  | 278 ------
 .../org/apache/nutch/parse/ParseResult.java     | 178 ----
 .../org/apache/nutch/parse/ParseSegment.java    | 309 -------
 .../org/apache/nutch/parse/ParseStatus.java     | 311 -------
 src/java/org/apache/nutch/parse/ParseText.java  | 119 ---
 src/java/org/apache/nutch/parse/ParseUtil.java  | 181 ----
 src/java/org/apache/nutch/parse/Parser.java     |  58 --
 .../org/apache/nutch/parse/ParserChecker.java   | 270 ------
 .../org/apache/nutch/parse/ParserFactory.java   | 428 ---------
 .../org/apache/nutch/parse/ParserNotFound.java  |  47 -
 .../org/apache/nutch/parse/package-info.java    |  22 -
 .../plugin/CircularDependencyException.java     |  36 -
 src/java/org/apache/nutch/plugin/Extension.java | 194 ----
 .../org/apache/nutch/plugin/ExtensionPoint.java | 123 ---
 .../plugin/MissingDependencyException.java      |  36 -
 src/java/org/apache/nutch/plugin/Pluggable.java |  31 -
 src/java/org/apache/nutch/plugin/Plugin.java    |  95 --
 .../apache/nutch/plugin/PluginClassLoader.java  |  80 --
 .../apache/nutch/plugin/PluginDescriptor.java   | 363 --------
 .../nutch/plugin/PluginManifestParser.java      | 303 -------
 .../apache/nutch/plugin/PluginRepository.java   | 523 -----------
 .../nutch/plugin/PluginRuntimeException.java    |  37 -
 src/java/org/apache/nutch/plugin/package.html   |  40 -
 src/java/org/apache/nutch/protocol/Content.java | 296 -------
 .../org/apache/nutch/protocol/Protocol.java     |  68 --
 .../nutch/protocol/ProtocolException.java       |  39 -
 .../apache/nutch/protocol/ProtocolFactory.java  | 119 ---
 .../apache/nutch/protocol/ProtocolNotFound.java |  36 -
 .../apache/nutch/protocol/ProtocolOutput.java   |  55 --
 .../apache/nutch/protocol/ProtocolStatus.java   | 297 -------
 .../apache/nutch/protocol/RobotRulesParser.java | 325 -------
 .../org/apache/nutch/protocol/package-info.java |  23 -
 .../nutch/scoring/AbstractScoringFilter.java    |  68 --
 .../org/apache/nutch/scoring/ScoringFilter.java | 213 -----
 .../nutch/scoring/ScoringFilterException.java   |  43 -
 .../apache/nutch/scoring/ScoringFilters.java    | 118 ---
 .../org/apache/nutch/scoring/package-info.java  |  22 -
 .../nutch/scoring/webgraph/LinkDatum.java       | 140 ---
 .../nutch/scoring/webgraph/LinkDumper.java      | 433 ---------
 .../apache/nutch/scoring/webgraph/LinkRank.java | 677 --------------
 .../org/apache/nutch/scoring/webgraph/Node.java | 102 ---
 .../nutch/scoring/webgraph/NodeDumper.java      | 433 ---------
 .../nutch/scoring/webgraph/NodeReader.java      | 136 ---
 .../nutch/scoring/webgraph/ScoreUpdater.java    | 253 ------
 .../apache/nutch/scoring/webgraph/WebGraph.java | 783 ----------------
 .../nutch/scoring/webgraph/package-info.java    |  24 -
 .../nutch/segment/ContentAsTextInputFormat.java | 104 ---
 .../apache/nutch/segment/SegmentChecker.java    | 136 ---
 .../nutch/segment/SegmentMergeFilter.java       |  47 -
 .../nutch/segment/SegmentMergeFilters.java      |  84 --
 .../org/apache/nutch/segment/SegmentMerger.java | 793 -----------------
 .../org/apache/nutch/segment/SegmentPart.java   | 113 ---
 .../org/apache/nutch/segment/SegmentReader.java | 719 ---------------
 .../org/apache/nutch/segment/package-info.java  |  23 -
 .../org/apache/nutch/service/ConfManager.java   |  39 -
 .../org/apache/nutch/service/JobManager.java    |  44 -
 .../org/apache/nutch/service/NutchReader.java   |  37 -
 .../org/apache/nutch/service/NutchServer.java   | 224 -----
 .../nutch/service/impl/ConfManagerImpl.java     | 132 ---
 .../apache/nutch/service/impl/JobFactory.java   |  75 --
 .../nutch/service/impl/JobManagerImpl.java      |  95 --
 .../apache/nutch/service/impl/JobWorker.java    | 114 ---
 .../apache/nutch/service/impl/LinkReader.java   | 175 ----
 .../apache/nutch/service/impl/NodeReader.java   | 184 ----
 .../service/impl/NutchServerPoolExecutor.java   | 131 ---
 .../nutch/service/impl/SequenceReader.java      | 171 ----
 .../nutch/service/model/request/DbQuery.java    |  56 --
 .../nutch/service/model/request/JobConfig.java  |  71 --
 .../service/model/request/NutchConfig.java      |  51 --
 .../service/model/request/ReaderConfig.java     |  30 -
 .../nutch/service/model/request/SeedList.java   |  93 --
 .../nutch/service/model/request/SeedUrl.java    |  89 --
 .../service/model/response/FetchNodeDbInfo.java | 103 ---
 .../nutch/service/model/response/JobInfo.java   | 102 ---
 .../service/model/response/NutchServerInfo.java |  55 --
 .../service/resources/AbstractResource.java     |  45 -
 .../nutch/service/resources/AdminResource.java  |  85 --
 .../nutch/service/resources/ConfigResource.java | 137 ---
 .../nutch/service/resources/DbResource.java     | 143 ---
 .../nutch/service/resources/JobResource.java    |  99 ---
 .../nutch/service/resources/ReaderResouce.java  | 177 ----
 .../nutch/service/resources/SeedResource.java   | 111 ---
 .../nutch/tools/AbstractCommonCrawlFormat.java  | 393 --------
 src/java/org/apache/nutch/tools/Benchmark.java  | 284 ------
 .../apache/nutch/tools/CommonCrawlConfig.java   | 147 ---
 .../nutch/tools/CommonCrawlDataDumper.java      | 716 ---------------
 .../apache/nutch/tools/CommonCrawlFormat.java   |  87 --
 .../nutch/tools/CommonCrawlFormatFactory.java   |  74 --
 .../nutch/tools/CommonCrawlFormatJackson.java   | 109 ---
 .../nutch/tools/CommonCrawlFormatJettinson.java | 122 ---
 .../nutch/tools/CommonCrawlFormatSimple.java    | 174 ----
 .../nutch/tools/CommonCrawlFormatWARC.java      | 286 ------
 src/java/org/apache/nutch/tools/DmozParser.java | 391 --------
 src/java/org/apache/nutch/tools/FileDumper.java | 419 ---------
 .../org/apache/nutch/tools/FreeGenerator.java   | 214 -----
 .../org/apache/nutch/tools/ResolveUrls.java     | 204 -----
 src/java/org/apache/nutch/tools/WARCUtils.java  | 154 ----
 .../apache/nutch/tools/arc/ArcInputFormat.java  |  51 --
 .../apache/nutch/tools/arc/ArcRecordReader.java | 299 -------
 .../nutch/tools/arc/ArcSegmentCreator.java      | 426 ---------
 .../apache/nutch/tools/arc/package-info.java    |  23 -
 .../org/apache/nutch/tools/package-info.java    |  22 -
 .../apache/nutch/tools/warc/WARCExporter.java   | 333 -------
 .../apache/nutch/tools/warc/package-info.java   |  23 -
 .../org/apache/nutch/util/CommandRunner.java    | 291 ------
 .../apache/nutch/util/CrawlCompletionStats.java | 245 -----
 .../org/apache/nutch/util/DeflateUtils.java     | 140 ---
 src/java/org/apache/nutch/util/DomUtil.java     | 104 ---
 .../org/apache/nutch/util/DumpFileUtil.java     | 147 ---
 .../org/apache/nutch/util/EncodingDetector.java | 386 --------
 src/java/org/apache/nutch/util/FSUtils.java     | 106 ---
 src/java/org/apache/nutch/util/GZIPUtils.java   | 148 ----
 .../nutch/util/GenericWritableConfigurable.java |  60 --
 .../org/apache/nutch/util/HadoopFSUtil.java     |  72 --
 src/java/org/apache/nutch/util/JexlUtil.java    |  76 --
 src/java/org/apache/nutch/util/LockUtil.java    |  84 --
 src/java/org/apache/nutch/util/MimeUtil.java    | 279 ------
 src/java/org/apache/nutch/util/NodeWalker.java  | 129 ---
 .../apache/nutch/util/NutchConfiguration.java   | 104 ---
 src/java/org/apache/nutch/util/NutchJob.java    |  30 -
 src/java/org/apache/nutch/util/NutchTool.java   | 109 ---
 src/java/org/apache/nutch/util/ObjectCache.java |  56 --
 .../apache/nutch/util/PrefixStringMatcher.java  | 119 ---
 .../nutch/util/ProtocolStatusStatistics.java    | 179 ----
 src/java/org/apache/nutch/util/StringUtil.java  | 155 ----
 .../apache/nutch/util/SuffixStringMatcher.java  | 114 ---
 src/java/org/apache/nutch/util/TableUtil.java   | 161 ----
 src/java/org/apache/nutch/util/TimingUtil.java  |  72 --
 .../apache/nutch/util/TrieStringMatcher.java    | 202 -----
 src/java/org/apache/nutch/util/URLUtil.java     | 533 -----------
 .../nutch/util/domain/DomainStatistics.java     | 234 -----
 .../apache/nutch/util/domain/DomainSuffix.java  |  79 --
 .../nutch/util/domain/DomainSuffixes.java       |  86 --
 .../nutch/util/domain/DomainSuffixesReader.java | 164 ----
 .../nutch/util/domain/TopLevelDomain.java       |  67 --
 .../org/apache/nutch/util/domain/package.html   |  14 -
 .../org/apache/nutch/util/package-info.java     |  22 -
 .../apache/nutch/webui/NutchUiApplication.java  |  75 --
 .../nutch/webui/NutchUiApplication.properties   |  63 --
 .../org/apache/nutch/webui/NutchUiServer.java   | 104 ---
 .../apache/nutch/webui/client/NutchClient.java  |  49 -
 .../nutch/webui/client/NutchClientFactory.java  |  52 --
 .../nutch/webui/client/impl/CrawlingCycle.java  |  82 --
 .../client/impl/CrawlingCycleListener.java      |  31 -
 .../webui/client/impl/NutchClientImpl.java      |  99 ---
 .../nutch/webui/client/impl/RemoteCommand.java  |  76 --
 .../webui/client/impl/RemoteCommandBuilder.java |  64 --
 .../client/impl/RemoteCommandExecutor.java      | 110 ---
 .../client/impl/RemoteCommandsBatchFactory.java |  97 --
 .../webui/client/model/ConnectionStatus.java    |  21 -
 .../apache/nutch/webui/client/model/Crawl.java  | 126 ---
 .../nutch/webui/client/model/JobConfig.java     |  77 --
 .../nutch/webui/client/model/JobInfo.java       | 104 ---
 .../nutch/webui/client/model/NutchStatus.java   |  62 --
 .../nutch/webui/config/CustomDaoFactory.java    |  58 --
 .../nutch/webui/config/CustomTableCreator.java  |  83 --
 .../webui/config/NutchGuiConfiguration.java     |  33 -
 .../nutch/webui/config/SpringConfiguration.java |  91 --
 .../apache/nutch/webui/model/NutchConfig.java   |  24 -
 .../apache/nutch/webui/model/NutchInstance.java | 118 ---
 .../org/apache/nutch/webui/model/SeedList.java  | 106 ---
 .../org/apache/nutch/webui/model/SeedUrl.java   |  96 --
 .../nutch/webui/pages/AbstractBasePage.html     |  33 -
 .../nutch/webui/pages/AbstractBasePage.java     | 206 -----
 .../apache/nutch/webui/pages/DashboardPage.html |  52 --
 .../apache/nutch/webui/pages/DashboardPage.java |  65 --
 .../apache/nutch/webui/pages/LogOutPage.java    |  21 -
 .../nutch/webui/pages/SchedulingPage.java       |  21 -
 .../apache/nutch/webui/pages/SearchPage.java    |  21 -
 .../nutch/webui/pages/StatisticsPage.java       |  21 -
 .../nutch/webui/pages/UrlsUploadPage.java       |  21 -
 .../nutch/webui/pages/UserSettingsPage.java     |  21 -
 .../webui/pages/assets/NutchUiCssReference.java |  39 -
 .../nutch/webui/pages/assets/nutch-style.css    | 149 ----
 .../webui/pages/components/ColorEnumLabel.java  |  71 --
 .../pages/components/ColorEnumLabelBuilder.java |  49 -
 .../pages/components/CpmIteratorAdapter.java    |  41 -
 .../nutch/webui/pages/crawls/CrawlPanel.html    |  58 --
 .../nutch/webui/pages/crawls/CrawlPanel.java    |  98 --
 .../nutch/webui/pages/crawls/CrawlsPage.html    |  90 --
 .../nutch/webui/pages/crawls/CrawlsPage.java    | 139 ---
 .../webui/pages/instances/InstancePanel.html    |  46 -
 .../webui/pages/instances/InstancePanel.java    |  62 --
 .../webui/pages/instances/InstancesPage.html    |  66 --
 .../webui/pages/instances/InstancesPage.java    | 127 ---
 .../nutch/webui/pages/menu/VerticalMenu.html    |  48 -
 .../nutch/webui/pages/menu/VerticalMenu.java    |  27 -
 .../nutch/webui/pages/seed/SeedListsPage.html   |  75 --
 .../nutch/webui/pages/seed/SeedListsPage.java   |  79 --
 .../apache/nutch/webui/pages/seed/SeedPage.html |  91 --
 .../apache/nutch/webui/pages/seed/SeedPage.java | 153 ----
 .../webui/pages/settings/SettingsPage.html      |  43 -
 .../webui/pages/settings/SettingsPage.java      |  59 --
 .../nutch/webui/service/CrawlService.java       |  33 -
 .../webui/service/NutchInstanceService.java     |  33 -
 .../nutch/webui/service/NutchService.java       |  31 -
 .../nutch/webui/service/SeedListService.java    |  33 -
 .../webui/service/impl/CrawlServiceImpl.java    | 132 ---
 .../service/impl/NutchInstanceServiceImpl.java  |  76 --
 .../webui/service/impl/NutchServiceImpl.java    |  82 --
 .../webui/service/impl/SeedListServiceImpl.java |  77 --
 src/java/overview.html                          |   9 -
 src/plugin/build-plugin.xml                     | 255 ------
 src/plugin/build.xml                            | 213 -----
 src/plugin/creativecommons/README.txt           |   1 -
 src/plugin/creativecommons/build.xml            |  28 -
 .../creativecommons/conf/crawl-urlfilter.txt    |  18 -
 src/plugin/creativecommons/conf/nutch-site.xml  |  50 --
 src/plugin/creativecommons/data/anchor.html     |   9 -
 src/plugin/creativecommons/data/rdf.html        |  35 -
 src/plugin/creativecommons/data/rel.html        |   6 -
 src/plugin/creativecommons/ivy.xml              |  41 -
 src/plugin/creativecommons/plugin.xml           |  48 -
 .../creativecommons/nutch/CCIndexingFilter.java | 124 ---
 .../creativecommons/nutch/CCParseFilter.java    | 300 -------
 .../java/org/creativecommons/nutch/package.html |   5 -
 .../nutch/TestCCParseFilter.java                |  73 --
 src/plugin/feed/build.xml                       |  45 -
 src/plugin/feed/ivy.xml                         |  43 -
 src/plugin/feed/plugin.xml                      |  49 -
 src/plugin/feed/sample/rsstest.rss              |  36 -
 .../nutch/indexer/feed/FeedIndexingFilter.java  | 129 ---
 .../apache/nutch/indexer/feed/package-info.java |  22 -
 .../org/apache/nutch/parse/feed/FeedParser.java | 374 --------
 .../apache/nutch/parse/feed/package-info.java   |  22 -
 .../apache/nutch/parse/feed/TestFeedParser.java | 124 ---
 src/plugin/headings/build.xml                   |  22 -
 src/plugin/headings/ivy.xml                     |  41 -
 src/plugin/headings/plugin.xml                  |  45 -
 .../parse/headings/HeadingsParseFilter.java     | 124 ---
 .../nutch/parse/headings/package-info.java      |  22 -
 src/plugin/index-anchor/build.xml               |  22 -
 src/plugin/index-anchor/ivy.xml                 |  41 -
 src/plugin/index-anchor/plugin.xml              |  38 -
 .../indexer/anchor/AnchorIndexingFilter.java    | 107 ---
 .../apache/nutch/indexer/anchor/package.html    |   5 -
 .../anchor/TestAnchorIndexingFilter.java        |  67 --
 src/plugin/index-basic/build.xml                |  22 -
 src/plugin/index-basic/ivy.xml                  |  41 -
 src/plugin/index-basic/plugin.xml               |  42 -
 .../indexer/basic/BasicIndexingFilter.java      | 158 ----
 .../org/apache/nutch/indexer/basic/package.html |   5 -
 .../indexer/basic/TestBasicIndexingFilter.java  |  99 ---
 src/plugin/index-geoip/build-ivy.xml            |  54 --
 src/plugin/index-geoip/build.xml                |  27 -
 src/plugin/index-geoip/ivy.xml                  |  46 -
 src/plugin/index-geoip/plugin.xml               |  51 --
 .../indexer/geoip/GeoIPDocumentCreator.java     | 210 -----
 .../indexer/geoip/GeoIPIndexingFilter.java      | 241 -----
 .../nutch/indexer/geoip/package-info.java       |  28 -
 src/plugin/index-links/build.xml                |  22 -
 src/plugin/index-links/ivy.xml                  |  41 -
 src/plugin/index-links/plugin.xml               |  41 -
 .../indexer/links/LinksIndexingFilter.java      | 167 ----
 .../indexer/links/TestLinksIndexingFilter.java  | 218 -----
 .../org/apache/nutch/parse/TestOutlinks.java    |  54 --
 src/plugin/index-metadata/build.xml             |  22 -
 src/plugin/index-metadata/ivy.xml               |  41 -
 src/plugin/index-metadata/plugin.xml            |  42 -
 .../nutch/indexer/metadata/MetadataIndexer.java | 104 ---
 .../nutch/indexer/metadata/package-info.java    |  23 -
 src/plugin/index-more/build.xml                 |  22 -
 src/plugin/index-more/ivy.xml                   |  41 -
 src/plugin/index-more/plugin.xml                |  42 -
 .../nutch/indexer/more/MoreIndexingFilter.java  | 344 -------
 .../org/apache/nutch/indexer/more/package.html  |   6 -
 .../indexer/more/TestMoreIndexingFilter.java    | 123 ---
 src/plugin/index-replace/README.txt             |  95 --
 src/plugin/index-replace/build.xml              |  55 --
 src/plugin/index-replace/ivy.xml                |  41 -
 src/plugin/index-replace/plugin.xml             |  22 -
 .../index-replace/sample/testIndexReplace.html  |  12 -
 .../nutch/indexer/replace/FieldReplacer.java    | 196 ----
 .../nutch/indexer/replace/ReplaceIndexer.java   | 330 -------
 .../nutch/indexer/replace/package-info.java     |  22 -
 .../nutch/indexer/replace/TestIndexReplace.java | 456 ----------
 src/plugin/index-static/build.xml               |  22 -
 src/plugin/index-static/ivy.xml                 |  41 -
 src/plugin/index-static/plugin.xml              |  42 -
 .../indexer/staticfield/StaticFieldIndexer.java | 143 ---
 .../nutch/indexer/staticfield/package.html      |   5 -
 .../staticfield/TestStaticFieldIndexerTest.java | 194 ----
 src/plugin/indexer-cloudsearch/README.md        |  58 --
 src/plugin/indexer-cloudsearch/build.xml        |  22 -
 .../indexer-cloudsearch/createCSDomain.sh       |  22 -
 src/plugin/indexer-cloudsearch/ivy.xml          |  41 -
 src/plugin/indexer-cloudsearch/plugin.xml       |  50 --
 .../cloudsearch/CloudSearchConstants.java       |  27 -
 .../cloudsearch/CloudSearchIndexWriter.java     | 382 --------
 .../cloudsearch/CloudSearchUtils.java           |  73 --
 src/plugin/indexer-dummy/build.xml              |  22 -
 src/plugin/indexer-dummy/ivy.xml                |  41 -
 src/plugin/indexer-dummy/plugin.xml             |  38 -
 .../indexwriter/dummy/DummyIndexWriter.java     | 103 ---
 .../nutch/indexwriter/dummy/package-info.java   |  23 -
 src/plugin/indexer-elastic/build-ivy.xml        |  54 --
 src/plugin/indexer-elastic/build.xml            |  22 -
 src/plugin/indexer-elastic/howto_upgrade_es.txt |   6 -
 src/plugin/indexer-elastic/ivy.xml              |  43 -
 src/plugin/indexer-elastic/plugin.xml           |  71 --
 .../indexwriter/elastic/ElasticConstants.java   |  28 -
 .../indexwriter/elastic/ElasticIndexWriter.java | 279 ------
 .../nutch/indexwriter/elastic/package-info.java |  22 -
 src/plugin/indexer-solr/build-ivy.xml           |  54 --
 src/plugin/indexer-solr/build.xml               |  22 -
 src/plugin/indexer-solr/ivy.xml                 |  44 -
 src/plugin/indexer-solr/plugin.xml              |  48 -
 .../nutch/indexwriter/solr/SolrConstants.java   |  56 --
 .../nutch/indexwriter/solr/SolrIndexWriter.java | 277 ------
 .../indexwriter/solr/SolrMappingReader.java     | 147 ---
 .../nutch/indexwriter/solr/SolrUtils.java       |  97 --
 .../nutch/indexwriter/solr/package-info.java    |  22 -
 src/plugin/language-identifier/build.xml        |  38 -
 src/plugin/language-identifier/ivy.xml          |  41 -
 src/plugin/language-identifier/plugin.xml       |  49 -
 .../nutch/analysis/lang/HTMLLanguageParser.java | 320 -------
 .../analysis/lang/LanguageIndexingFilter.java   |  89 --
 .../nutch/analysis/lang/langmappings.properties | 188 ----
 .../org/apache/nutch/analysis/lang/package.html |   6 -
 .../analysis/lang/TestHTMLLanguageParser.java   | 149 ----
 .../test/org/apache/nutch/analysis/lang/da.test | 108 ---
 .../test/org/apache/nutch/analysis/lang/de.test | 104 ---
 .../test/org/apache/nutch/analysis/lang/el.test | 109 ---
 .../test/org/apache/nutch/analysis/lang/en.test | 105 ---
 .../test/org/apache/nutch/analysis/lang/es.test | 107 ---
 .../test/org/apache/nutch/analysis/lang/fi.test | 106 ---
 .../test/org/apache/nutch/analysis/lang/fr.test | 105 ---
 .../test/org/apache/nutch/analysis/lang/it.test | 109 ---
 .../test/org/apache/nutch/analysis/lang/nl.test | 105 ---
 .../test/org/apache/nutch/analysis/lang/pt.test | 105 ---
 .../test/org/apache/nutch/analysis/lang/sv.test | 108 ---
 .../nutch/analysis/lang/test-referencial.txt    |  10 -
 src/plugin/lib-htmlunit/build-ivy.xml           |  54 --
 src/plugin/lib-htmlunit/build.xml               |  28 -
 src/plugin/lib-htmlunit/ivy.xml                 |  52 --
 src/plugin/lib-htmlunit/plugin.xml              | 166 ----
 .../protocol/htmlunit/HtmlUnitWebDriver.java    | 189 ----
 .../htmlunit/HtmlUnitWebWindowListener.java     |  53 --
 src/plugin/lib-http/build.xml                   |  22 -
 src/plugin/lib-http/ivy.xml                     |  41 -
 src/plugin/lib-http/plugin.xml                  |  33 -
 .../protocol/http/api/BlockedException.java     |  26 -
 .../nutch/protocol/http/api/HttpBase.java       | 587 ------------
 .../nutch/protocol/http/api/HttpException.java  |  40 -
 .../protocol/http/api/HttpRobotRulesParser.java | 167 ----
 .../apache/nutch/protocol/http/api/package.html |   6 -
 .../protocol/http/api/TestRobotRulesParser.java | 123 ---
 src/plugin/lib-nekohtml/build.xml               |  30 -
 src/plugin/lib-nekohtml/ivy.xml                 |  42 -
 src/plugin/lib-nekohtml/plugin.xml              |  38 -
 src/plugin/lib-regex-filter/build.xml           |  22 -
 src/plugin/lib-regex-filter/ivy.xml             |  41 -
 src/plugin/lib-regex-filter/plugin.xml          |  33 -
 .../apache/nutch/urlfilter/api/RegexRule.java   | 102 ---
 .../nutch/urlfilter/api/RegexURLFilterBase.java | 315 -------
 .../nutch/urlfilter/api/package-info.java       |  23 -
 .../urlfilter/api/RegexURLFilterBaseTest.java   | 134 ---
 src/plugin/lib-selenium/build-ivy.xml           |  54 --
 src/plugin/lib-selenium/build.xml               |  28 -
 .../lib-selenium/howto_upgrade_selenium.txt     |  15 -
 src/plugin/lib-selenium/ivy.xml                 |  52 --
 src/plugin/lib-selenium/plugin.xml              | 175 ----
 .../nutch/protocol/selenium/HttpWebClient.java  | 236 -----
 src/plugin/lib-xml/build.xml                    |  36 -
 src/plugin/lib-xml/ivy.xml                      |  44 -
 src/plugin/lib-xml/plugin.xml                   |  65 --
 src/plugin/microformats-reltag/build.xml        |  27 -
 src/plugin/microformats-reltag/ivy.xml          |  41 -
 src/plugin/microformats-reltag/plugin.xml       |  49 -
 .../reltag/RelTagIndexingFilter.java            |  77 --
 .../nutch/microformats/reltag/RelTagParser.java | 148 ----
 .../nutch/microformats/reltag/package.html      |   8 -
 src/plugin/mimetype-filter/build.xml            |  28 -
 src/plugin/mimetype-filter/ivy.xml              |  41 -
 src/plugin/mimetype-filter/plugin.xml           |  37 -
 .../mimetype-filter/sample/allow-images.txt     |  34 -
 .../mimetype-filter/sample/block-html.txt       |  34 -
 .../indexer/filter/MimeTypeIndexingFilter.java  | 273 ------
 .../filter/MimeTypeIndexingFilterTest.java      | 114 ---
 src/plugin/nutch-extensionpoints/build.xml      |  30 -
 src/plugin/nutch-extensionpoints/ivy.xml        |  41 -
 src/plugin/nutch-extensionpoints/plugin.xml     |  67 --
 src/plugin/parse-ext/build.xml                  |  32 -
 src/plugin/parse-ext/command                    |  24 -
 src/plugin/parse-ext/ivy.xml                    |  41 -
 src/plugin/parse-ext/plugin.xml                 |  60 --
 .../org/apache/nutch/parse/ext/ExtParser.java   | 183 ----
 .../apache/nutch/parse/ext/package-info.java    |  22 -
 .../apache/nutch/parse/ext/TestExtParser.java   | 130 ---
 src/plugin/parse-html/build.xml                 |  40 -
 src/plugin/parse-html/ivy.xml                   |  42 -
 src/plugin/parse-html/plugin.xml                |  48 -
 .../org/apache/nutch/parse/html/DOMBuilder.java | 766 ----------------
 .../nutch/parse/html/DOMContentUtils.java       | 400 ---------
 .../nutch/parse/html/HTMLMetaProcessor.java     | 214 -----
 .../org/apache/nutch/parse/html/HtmlParser.java | 352 --------
 .../parse/html/XMLCharacterRecognizer.java      | 112 ---
 .../org/apache/nutch/parse/html/package.html    |   5 -
 .../nutch/parse/html/TestDOMContentUtils.java   | 347 --------
 .../apache/nutch/parse/html/TestHtmlParser.java | 122 ---
 .../parse/html/TestRobotsMetaProcessor.java     | 155 ----
 src/plugin/parse-js/build.xml                   |  22 -
 src/plugin/parse-js/ivy.xml                     |  41 -
 src/plugin/parse-js/plugin.xml                  |  53 --
 .../apache/nutch/parse/js/JSParseFilter.java    | 301 -------
 .../org/apache/nutch/parse/js/package-info.java |  23 -
 src/plugin/parse-metatags/README.txt            |  17 -
 src/plugin/parse-metatags/build.xml             |  37 -
 src/plugin/parse-metatags/ivy.xml               |  41 -
 src/plugin/parse-metatags/plugin.xml            |  22 -
 .../parse-metatags/sample/testMetatags.html     |   9 -
 .../sample/testMultivalueMetatags.html          |  12 -
 .../nutch/parse/metatags/MetaTagsParser.java    | 124 ---
 .../nutch/parse/metatags/package-info.java      |  24 -
 .../nutch/parse/metatags/TestMetatagParser.java | 104 ---
 src/plugin/parse-replace/README.txt             |  91 --
 src/plugin/parse-replace/build.xml              |  37 -
 src/plugin/parse-replace/ivy.xml                |  41 -
 src/plugin/parse-replace/plugin.xml             |  22 -
 .../parse-replace/sample/testParseReplace.html  |  11 -
 .../nutch/parse/replace/ReplaceParser.java      |  74 --
 .../nutch/parse/replace/package-info.java       |  22 -
 .../nutch/parse/replace/TestParseReplace.java   |  68 --
 src/plugin/parse-swf/build.xml                  |  38 -
 src/plugin/parse-swf/ivy.xml                    |  41 -
 src/plugin/parse-swf/lib/javaswf-LICENSE.txt    |  33 -
 src/plugin/parse-swf/lib/javaswf.jar            | Bin 125369 -> 0 bytes
 src/plugin/parse-swf/plugin.xml                 |  44 -
 src/plugin/parse-swf/sample/test1.swf           | Bin 21054 -> 0 bytes
 src/plugin/parse-swf/sample/test1.txt           |  60 --
 src/plugin/parse-swf/sample/test2.swf           | Bin 42534 -> 0 bytes
 src/plugin/parse-swf/sample/test2.txt           |   5 -
 src/plugin/parse-swf/sample/test3.swf           | Bin 51562 -> 0 bytes
 src/plugin/parse-swf/sample/test3.txt           |  11 -
 .../org/apache/nutch/parse/swf/SWFParser.java   | 685 --------------
 .../apache/nutch/parse/swf/package-info.java    |  22 -
 .../apache/nutch/parse/swf/TestSWFParser.java   |  94 --
 src/plugin/parse-tika/build-ivy.xml             |  54 --
 src/plugin/parse-tika/build.xml                 |  55 --
 src/plugin/parse-tika/howto_upgrade_tika.txt    |   8 -
 src/plugin/parse-tika/ivy.xml                   |  46 -
 src/plugin/parse-tika/plugin.xml                | 136 ---
 src/plugin/parse-tika/sample/encrypted.pdf      | Bin 3431 -> 0 bytes
 src/plugin/parse-tika/sample/nutch.html         | 519 -----------
 src/plugin/parse-tika/sample/nutch_logo_tm.gif  | Bin 2747 -> 0 bytes
 src/plugin/parse-tika/sample/ootest.odt         | Bin 20753 -> 0 bytes
 src/plugin/parse-tika/sample/ootest.sxw         | Bin 20125 -> 0 bytes
 src/plugin/parse-tika/sample/ootest.txt         |  30 -
 src/plugin/parse-tika/sample/pdftest.pdf        | 157 ----
 src/plugin/parse-tika/sample/rsstest.rss        |  37 -
 src/plugin/parse-tika/sample/test.rtf           |  17 -
 src/plugin/parse-tika/sample/word97.doc         | Bin 8192 -> 0 bytes
 .../tika/BoilerpipeExtractorRepository.java     |  62 --
 .../org/apache/nutch/parse/tika/DOMBuilder.java | 794 -----------------
 .../nutch/parse/tika/DOMContentUtils.java       | 402 ---------
 .../nutch/parse/tika/HTMLMetaProcessor.java     | 214 -----
 .../org/apache/nutch/parse/tika/TikaParser.java | 286 ------
 .../parse/tika/XMLCharacterRecognizer.java      | 112 ---
 .../apache/nutch/parse/tika/package-info.java   |  23 -
 .../apache/nutch/tika/TestDOMContentUtils.java  | 337 -------
 .../org/apache/nutch/tika/TestFeedParser.java   | 121 ---
 .../apache/nutch/tika/TestImageMetadata.java    |  67 --
 .../org/apache/nutch/tika/TestMSWordParser.java |  92 --
 .../org/apache/nutch/tika/TestOOParser.java     | 107 ---
 .../org/apache/nutch/tika/TestPdfParser.java    |  73 --
 .../org/apache/nutch/tika/TestRTFParser.java    |  81 --
 .../nutch/tika/TestRobotsMetaProcessor.java     | 156 ----
 src/plugin/parse-zip/build.xml                  |  38 -
 src/plugin/parse-zip/ivy.xml                    |  41 -
 src/plugin/parse-zip/plugin.xml                 |  46 -
 src/plugin/parse-zip/sample/test.zip            | Bin 182 -> 0 bytes
 .../org/apache/nutch/parse/zip/ZipParser.java   | 144 ---
 .../nutch/parse/zip/ZipTextExtractor.java       | 120 ---
 .../apache/nutch/parse/zip/package-info.java    |  22 -
 .../apache/nutch/parse/zip/TestZipParser.java   |  71 --
 src/plugin/parsefilter-naivebayes/build-ivy.xml |  54 --
 src/plugin/parsefilter-naivebayes/build.xml     |  22 -
 src/plugin/parsefilter-naivebayes/ivy.xml       |  49 -
 src/plugin/parsefilter-naivebayes/plugin.xml    |  56 --
 .../nutch/parsefilter/naivebayes/Classify.java  | 120 ---
 .../naivebayes/NaiveBayesParseFilter.java       | 197 ----
 .../nutch/parsefilter/naivebayes/Train.java     | 148 ----
 .../parsefilter/naivebayes/package-info.java    |  28 -
 src/plugin/parsefilter-regex/build.xml          |  27 -
 .../data/regex-parsefilter.txt                  |  10 -
 src/plugin/parsefilter-regex/ivy.xml            |  37 -
 src/plugin/parsefilter-regex/plugin.xml         |  42 -
 .../parsefilter/regex/RegexParseFilter.java     | 199 -----
 .../nutch/parsefilter/regex/package-info.java   |  23 -
 .../parsefilter/regex/TestRegexParseFilter.java |  77 --
 src/plugin/plugin.dtd                           | 206 -----
 src/plugin/protocol-file/build.xml              |  29 -
 src/plugin/protocol-file/ivy.xml                |  41 -
 src/plugin/protocol-file/plugin.xml             |  46 -
 .../protocol-file/sample/testprotocolfile.txt   |   1 -
 .../sample/testprotocolfile_(encoded).txt       |   1 -
 .../org/apache/nutch/protocol/file/File.java    | 228 -----
 .../apache/nutch/protocol/file/FileError.java   |  36 -
 .../nutch/protocol/file/FileException.java      |  40 -
 .../nutch/protocol/file/FileResponse.java       | 317 -------
 .../org/apache/nutch/protocol/file/package.html |   5 -
 .../nutch/protocol/file/TestProtocolFile.java   |  99 ---
 src/plugin/protocol-ftp/build.xml               |  22 -
 src/plugin/protocol-ftp/ivy.xml                 |  42 -
 src/plugin/protocol-ftp/plugin.xml              |  46 -
 .../org/apache/nutch/protocol/ftp/Client.java   | 595 -------------
 .../java/org/apache/nutch/protocol/ftp/Ftp.java | 267 ------
 .../org/apache/nutch/protocol/ftp/FtpError.java |  36 -
 .../apache/nutch/protocol/ftp/FtpException.java |  46 -
 .../ftp/FtpExceptionBadSystResponse.java        |  29 -
 .../FtpExceptionCanNotHaveDataConnection.java   |  29 -
 ...ExceptionControlClosedByForcedDataClose.java |  30 -
 .../ftp/FtpExceptionUnknownForcedDataClose.java |  30 -
 .../apache/nutch/protocol/ftp/FtpResponse.java  | 521 -----------
 .../nutch/protocol/ftp/FtpRobotRulesParser.java | 121 ---
 .../protocol/ftp/PrintCommandListener.java      |  71 --
 .../org/apache/nutch/protocol/ftp/package.html  |   5 -
 src/plugin/protocol-htmlunit/build.xml          |  37 -
 src/plugin/protocol-htmlunit/ivy.xml            |  38 -
 src/plugin/protocol-htmlunit/plugin.xml         |  51 --
 .../apache/nutch/protocol/htmlunit/Http.java    |  63 --
 .../nutch/protocol/htmlunit/HttpResponse.java   | 573 ------------
 .../apache/nutch/protocol/htmlunit/package.html |  21 -
 src/plugin/protocol-http/build.xml              |  50 --
 src/plugin/protocol-http/ivy.xml                |  41 -
 src/plugin/protocol-http/jsp/basic-http.jsp     |  44 -
 src/plugin/protocol-http/jsp/brokenpage.jsp     |  47 -
 src/plugin/protocol-http/jsp/redirect301.jsp    |  49 -
 src/plugin/protocol-http/jsp/redirect302.jsp    |  49 -
 src/plugin/protocol-http/plugin.xml             |  51 --
 .../org/apache/nutch/protocol/http/Http.java    |  73 --
 .../nutch/protocol/http/HttpResponse.java       | 558 ------------
 .../org/apache/nutch/protocol/http/package.html |   5 -
 .../src/test/conf/nutch-site-test.xml           |  52 --
 .../nutch/protocol/http/TestProtocolHttp.java   | 140 ---
 src/plugin/protocol-httpclient/build.xml        |  45 -
 src/plugin/protocol-httpclient/ivy.xml          |  42 -
 src/plugin/protocol-httpclient/jsp/basic.jsp    |  74 --
 src/plugin/protocol-httpclient/jsp/cookies.jsp  |  63 --
 src/plugin/protocol-httpclient/jsp/digest.jsp   |  68 --
 src/plugin/protocol-httpclient/jsp/noauth.jsp   |  36 -
 src/plugin/protocol-httpclient/jsp/ntlm.jsp     |  89 --
 src/plugin/protocol-httpclient/plugin.xml       |  58 --
 .../DummySSLProtocolSocketFactory.java          | 163 ----
 .../httpclient/DummyX509TrustManager.java       |  92 --
 .../apache/nutch/protocol/httpclient/Http.java  | 572 ------------
 .../protocol/httpclient/HttpAuthentication.java |  45 -
 .../httpclient/HttpAuthenticationException.java |  71 --
 .../httpclient/HttpAuthenticationFactory.java   |  98 --
 .../httpclient/HttpBasicAuthentication.java     | 199 -----
 .../httpclient/HttpFormAuthConfigurer.java      | 106 ---
 .../httpclient/HttpFormAuthentication.java      | 223 -----
 .../nutch/protocol/httpclient/HttpResponse.java | 216 -----
 .../nutch/protocol/httpclient/package.html      |   9 -
 .../src/test/conf/httpclient-auth-test.xml      |  58 --
 .../src/test/conf/nutch-site-test.xml           |  52 --
 .../httpclient/TestProtocolHttpClient.java      | 217 -----
 .../protocol-interactiveselenium/README.md      |  38 -
 .../protocol-interactiveselenium/build-ivy.xml  |  54 --
 .../protocol-interactiveselenium/build.xml      |  37 -
 src/plugin/protocol-interactiveselenium/ivy.xml |  42 -
 .../protocol-interactiveselenium/plugin.xml     |  47 -
 .../protocol/interactiveselenium/Http.java      |  59 --
 .../interactiveselenium/HttpResponse.java       | 399 ---------
 .../DefalultMultiInteractionHandler.java        |  53 --
 .../DefaultClickAllAjaxLinksHandler.java        |  88 --
 .../handlers/DefaultHandler.java                |  30 -
 .../handlers/InteractiveSeleniumHandler.java    |  25 -
 .../protocol/interactiveselenium/package.html   |   5 -
 src/plugin/protocol-selenium/README.md          | 208 -----
 src/plugin/protocol-selenium/build-ivy.xml      |  54 --
 src/plugin/protocol-selenium/build.xml          |  36 -
 src/plugin/protocol-selenium/ivy.xml            |  42 -
 src/plugin/protocol-selenium/plugin.xml         |  47 -
 .../apache/nutch/protocol/selenium/Http.java    |  59 --
 .../nutch/protocol/selenium/HttpResponse.java   | 360 --------
 .../apache/nutch/protocol/selenium/package.html |   5 -
 src/plugin/scoring-depth/build.xml              |   6 -
 src/plugin/scoring-depth/ivy.xml                |  41 -
 src/plugin/scoring-depth/plugin.xml             |  24 -
 .../nutch/scoring/depth/DepthScoringFilter.java | 207 -----
 .../nutch/scoring/depth/package-info.java       |  23 -
 src/plugin/scoring-link/build.xml               |  27 -
 src/plugin/scoring-link/ivy.xml                 |  41 -
 src/plugin/scoring-link/plugin.xml              |  39 -
 .../scoring/link/LinkAnalysisScoringFilter.java |  95 --
 .../apache/nutch/scoring/link/package-info.java |  23 -
 src/plugin/scoring-opic/build.xml               |  27 -
 src/plugin/scoring-opic/ivy.xml                 |  41 -
 src/plugin/scoring-opic/plugin.xml              |  39 -
 .../nutch/scoring/opic/OPICScoringFilter.java   | 173 ----
 .../apache/nutch/scoring/opic/package-info.java |  23 -
 src/plugin/scoring-similarity/build-ivy.xml     |  54 --
 src/plugin/scoring-similarity/build.xml         |  27 -
 src/plugin/scoring-similarity/ivy.xml           |  42 -
 src/plugin/scoring-similarity/plugin.xml        |  45 -
 .../scoring/similarity/SimilarityModel.java     |  38 -
 .../similarity/SimilarityScoringFilter.java     |  70 --
 .../similarity/cosine/CosineSimilarity.java     |  84 --
 .../scoring/similarity/cosine/DocVector.java    |  57 --
 .../nutch/scoring/similarity/cosine/Model.java  | 190 ----
 .../scoring/similarity/cosine/package-info.java |   7 -
 .../similarity/util/LuceneAnalyzerUtil.java     |  93 --
 .../similarity/util/LuceneTokenizer.java        | 166 ----
 .../scoring/similarity/util/package-info.java   |  24 -
 src/plugin/subcollection/README.txt             |  10 -
 src/plugin/subcollection/build.xml              |  22 -
 src/plugin/subcollection/ivy.xml                |  41 -
 src/plugin/subcollection/plugin.xml             |  41 -
 .../nutch/collection/CollectionManager.java     | 240 -----
 .../apache/nutch/collection/Subcollection.java  | 259 ------
 .../org/apache/nutch/collection/package.html    |  36 -
 .../SubcollectionIndexingFilter.java            | 101 ---
 .../indexer/subcollection/package-info.java     |  25 -
 .../nutch/collection/TestSubcollection.java     | 112 ---
 src/plugin/tld/build.xml                        |  22 -
 src/plugin/tld/ivy.xml                          |  41 -
 src/plugin/tld/plugin.xml                       |  51 --
 .../nutch/indexer/tld/TLDIndexingFilter.java    |  69 --
 .../org/apache/nutch/indexer/tld/package.html   |   5 -
 .../nutch/scoring/tld/TLDScoringFilter.java     | 114 ---
 .../org/apache/nutch/scoring/tld/package.html   |   5 -
 src/plugin/urlfilter-automaton/build.xml        |  51 --
 src/plugin/urlfilter-automaton/ivy.xml          |  42 -
 src/plugin/urlfilter-automaton/plugin.xml       |  43 -
 .../urlfilter-automaton/sample/Benchmarks.rules |  26 -
 .../urlfilter-automaton/sample/Benchmarks.urls  | 297 -------
 .../sample/IntranetCrawling.rules               |  24 -
 .../sample/IntranetCrawling.urls                |   8 -
 .../sample/WholeWebCrawling.rules               |  19 -
 .../sample/WholeWebCrawling.urls                |  11 -
 .../urlfilter/automaton/AutomatonURLFilter.java | 116 ---
 .../nutch/urlfilter/automaton/package.html      |   9 -
 .../automaton/TestAutomatonURLFilter.java       |  56 --
 src/plugin/urlfilter-domain/build.xml           |  28 -
 src/plugin/urlfilter-domain/data/hosts.txt      |   5 -
 src/plugin/urlfilter-domain/ivy.xml             |  41 -
 src/plugin/urlfilter-domain/plugin.xml          |  43 -
 .../nutch/urlfilter/domain/DomainURLFilter.java | 212 -----
 .../nutch/urlfilter/domain/package-info.java    |  25 -
 .../urlfilter/domain/TestDomainURLFilter.java   |  67 --
 src/plugin/urlfilter-domainblacklist/build.xml  |  28 -
 .../urlfilter-domainblacklist/data/hosts.txt    |   5 -
 src/plugin/urlfilter-domainblacklist/ivy.xml    |  41 -
 src/plugin/urlfilter-domainblacklist/plugin.xml |  43 -
 .../DomainBlacklistURLFilter.java               | 210 -----
 .../urlfilter/domainblacklist/package-info.java |  24 -
 .../TestDomainBlacklistURLFilter.java           |  49 -
 src/plugin/urlfilter-ignoreexempt/README.md     |  43 -
 src/plugin/urlfilter-ignoreexempt/build.xml     |  55 --
 .../urlfilter-ignoreexempt/data/.donotdelete    |   0
 src/plugin/urlfilter-ignoreexempt/ivy.xml       |  41 -
 src/plugin/urlfilter-ignoreexempt/plugin.xml    |  45 -
 .../ignoreexempt/ExemptionUrlFilter.java        | 101 ---
 .../urlfilter/ignoreexempt/package-info.java    |  24 -
 src/plugin/urlfilter-prefix/build.xml           |  22 -
 src/plugin/urlfilter-prefix/ivy.xml             |  41 -
 src/plugin/urlfilter-prefix/plugin.xml          |  47 -
 .../nutch/urlfilter/prefix/PrefixURLFilter.java | 178 ----
 .../apache/nutch/urlfilter/prefix/package.html  |   5 -
 .../urlfilter/prefix/TestPrefixURLFilter.java   |  79 --
 src/plugin/urlfilter-regex/build.xml            |  51 --
 src/plugin/urlfilter-regex/ivy.xml              |  41 -
 src/plugin/urlfilter-regex/plugin.xml           |  48 -
 .../urlfilter-regex/sample/Benchmarks.rules     |  26 -
 .../urlfilter-regex/sample/Benchmarks.urls      | 297 -------
 .../sample/IntranetCrawling.rules               |  27 -
 .../sample/IntranetCrawling.urls                |   8 -
 .../sample/WholeWebCrawling.rules               |  22 -
 .../sample/WholeWebCrawling.urls                |  11 -
 .../urlfilter-regex/sample/nutch1838.rules      |  12 -
 .../urlfilter-regex/sample/nutch1838.urls       |   3 -
 .../nutch/urlfilter/regex/RegexURLFilter.java   | 111 ---
 .../apache/nutch/urlfilter/regex/package.html   |   5 -
 .../urlfilter/regex/TestRegexURLFilter.java     |  61 --
 src/plugin/urlfilter-suffix/build.xml           |  22 -
 src/plugin/urlfilter-suffix/ivy.xml             |  41 -
 src/plugin/urlfilter-suffix/plugin.xml          |  47 -
 .../nutch/urlfilter/suffix/SuffixURLFilter.java | 331 -------
 .../nutch/urlfilter/suffix/package-info.java    |  23 -
 .../urlfilter/suffix/TestSuffixURLFilter.java   | 123 ---
 src/plugin/urlfilter-validator/build.xml        |  22 -
 src/plugin/urlfilter-validator/ivy.xml          |  41 -
 src/plugin/urlfilter-validator/plugin.xml       |  41 -
 .../nutch/urlfilter/validator/UrlValidator.java | 386 --------
 .../nutch/urlfilter/validator/package.html      |   9 -
 .../urlfilter/validator/TestUrlValidator.java   |  79 --
 src/plugin/urlmeta/build.xml                    |  22 -
 src/plugin/urlmeta/ivy.xml                      |  41 -
 src/plugin/urlmeta/plugin.xml                   |  47 -
 .../indexer/urlmeta/URLMetaIndexingFilter.java  | 118 ---
 .../apache/nutch/indexer/urlmeta/package.html   |  12 -
 .../scoring/urlmeta/URLMetaScoringFilter.java   | 175 ----
 .../apache/nutch/scoring/urlmeta/package.html   |  11 -
 src/plugin/urlnormalizer-ajax/build.xml         |  22 -
 src/plugin/urlnormalizer-ajax/ivy.xml           |  41 -
 src/plugin/urlnormalizer-ajax/plugin.xml        |  41 -
 .../urlnormalizer/ajax/AjaxURLNormalizer.java   | 236 -----
 .../ajax/TestAjaxURLNormalizer.java             |  67 --
 src/plugin/urlnormalizer-basic/build.xml        |  22 -
 src/plugin/urlnormalizer-basic/ivy.xml          |  41 -
 src/plugin/urlnormalizer-basic/plugin.xml       |  41 -
 .../urlnormalizer/basic/BasicURLNormalizer.java | 290 ------
 .../net/urlnormalizer/basic/package-info.java   |  23 -
 .../basic/TestBasicURLNormalizer.java           | 175 ----
 src/plugin/urlnormalizer-host/build.xml         |  27 -
 src/plugin/urlnormalizer-host/data/hosts.txt    |   8 -
 src/plugin/urlnormalizer-host/ivy.xml           |  41 -
 src/plugin/urlnormalizer-host/plugin.xml        |  43 -
 .../urlnormalizer/host/HostURLNormalizer.java   | 198 -----
 .../net/urlnormalizer/host/package-info.java    |  23 -
 .../host/TestHostURLNormalizer.java             |  57 --
 src/plugin/urlnormalizer-pass/build.xml         |  22 -
 src/plugin/urlnormalizer-pass/ivy.xml           |  41 -
 src/plugin/urlnormalizer-pass/plugin.xml        |  41 -
 .../urlnormalizer/pass/PassURLNormalizer.java   |  49 -
 .../net/urlnormalizer/pass/package-info.java    |  23 -
 .../pass/TestPassURLNormalizer.java             |  45 -
 src/plugin/urlnormalizer-protocol/build.xml     |  27 -
 .../urlnormalizer-protocol/data/protocols.txt   |   7 -
 src/plugin/urlnormalizer-protocol/ivy.xml       |  41 -
 src/plugin/urlnormalizer-protocol/plugin.xml    |  43 -
 .../protocol/ProtocolURLNormalizer.java         | 190 ----
 .../protocol/TestProtocolURLNormalizer.java     |  55 --
 src/plugin/urlnormalizer-querystring/build.xml  |  22 -
 src/plugin/urlnormalizer-querystring/ivy.xml    |  41 -
 src/plugin/urlnormalizer-querystring/plugin.xml |  42 -
 .../querystring/QuerystringURLNormalizer.java   |  91 --
 .../urlnormalizer/querystring/package-info.java |  23 -
 .../TestQuerystringURLNormalizer.java           |  49 -
 src/plugin/urlnormalizer-regex/build.xml        |  34 -
 src/plugin/urlnormalizer-regex/ivy.xml          |  41 -
 src/plugin/urlnormalizer-regex/plugin.xml       |  41 -
 .../sample/regex-normalize-default.test         |  84 --
 .../sample/regex-normalize-default.xml          |  66 --
 .../sample/regex-normalize-scope1.test          |   8 -
 .../sample/regex-normalize-scope1.xml           |  21 -
 .../urlnormalizer/regex/RegexURLNormalizer.java | 324 -------
 .../net/urlnormalizer/regex/package-info.java   |  23 -
 .../regex/TestRegexURLNormalizer.java           | 186 ----
 src/plugin/urlnormalizer-slash/build.xml        |  27 -
 src/plugin/urlnormalizer-slash/data/slashes.txt |   7 -
 src/plugin/urlnormalizer-slash/ivy.xml          |  41 -
 src/plugin/urlnormalizer-slash/plugin.xml       |  43 -
 .../urlnormalizer/slash/SlashURLNormalizer.java | 224 -----
 .../slash/TestSlashURLNormalizer.java           |  73 --
 src/test/crawl-tests.xml                        |  62 --
 src/test/domain-urlfilter.txt                   |  22 -
 src/test/filter-all.txt                         |   7 -
 src/test/log4j.properties                       |   7 -
 src/test/nutch-site.xml                         |  19 -
 .../nutch/crawl/ContinuousCrawlTestUtil.java    | 270 ------
 .../org/apache/nutch/crawl/CrawlDBTestUtil.java | 179 ----
 .../nutch/crawl/CrawlDbUpdateTestDriver.java    | 138 ---
 .../apache/nutch/crawl/CrawlDbUpdateUtil.java   | 166 ----
 .../org/apache/nutch/crawl/DummyWritable.java   |  32 -
 .../nutch/crawl/TODOTestCrawlDbStates.java      | 168 ----
 .../nutch/crawl/TestAdaptiveFetchSchedule.java  | 121 ---
 .../apache/nutch/crawl/TestCrawlDbFilter.java   | 145 ---
 .../apache/nutch/crawl/TestCrawlDbMerger.java   | 160 ----
 .../apache/nutch/crawl/TestCrawlDbStates.java   | 566 ------------
 .../org/apache/nutch/crawl/TestGenerator.java   | 370 --------
 .../org/apache/nutch/crawl/TestInjector.java    | 181 ----
 .../apache/nutch/crawl/TestLinkDbMerger.java    | 160 ----
 .../nutch/crawl/TestSignatureFactory.java       |  35 -
 .../org/apache/nutch/fetcher/TestFetcher.java   | 207 -----
 .../nutch/indexer/TestIndexerMapReduce.java     | 187 ----
 .../nutch/indexer/TestIndexingFilters.java      | 110 ---
 .../org/apache/nutch/metadata/TestMetadata.java | 281 ------
 .../metadata/TestSpellCheckedMetadata.java      | 303 -------
 .../org/apache/nutch/net/TestURLFilters.java    |  41 -
 .../apache/nutch/net/TestURLNormalizers.java    |  83 --
 .../nutch/parse/TestOutlinkExtractor.java       |  99 ---
 .../org/apache/nutch/parse/TestParseData.java   |  58 --
 .../org/apache/nutch/parse/TestParseText.java   |  34 -
 .../apache/nutch/parse/TestParserFactory.java   | 105 ---
 .../apache/nutch/parse/parse-plugin-test.xml    |  58 --
 .../nutch/plugin/HelloWorldExtension.java       |  36 -
 .../org/apache/nutch/plugin/ITestExtension.java |  27 -
 .../apache/nutch/plugin/SimpleTestPlugin.java   |  57 --
 .../apache/nutch/plugin/TestPluginSystem.java   | 302 -------
 .../org/apache/nutch/protocol/TestContent.java  |  94 --
 .../nutch/protocol/TestProtocolFactory.java     |  85 --
 .../apache/nutch/segment/TestSegmentMerger.java | 131 ---
 .../segment/TestSegmentMergerCrawlDatums.java   | 427 ---------
 .../apache/nutch/service/TestNutchServer.java   |  65 --
 .../nutch/tools/TestCommonCrawlDataDumper.java  | 125 ---
 .../tools/proxy/AbstractTestbedHandler.java     |  49 -
 .../apache/nutch/tools/proxy/DelayHandler.java  |  56 --
 .../apache/nutch/tools/proxy/FakeHandler.java   | 102 ---
 .../nutch/tools/proxy/LogDebugHandler.java      |  64 --
 .../nutch/tools/proxy/NotFoundHandler.java      |  40 -
 .../apache/nutch/tools/proxy/ProxyTestbed.java  | 156 ----
 .../nutch/tools/proxy/SegmentHandler.java       | 255 ------
 .../apache/nutch/tools/proxy/package-info.java  |  22 -
 .../org/apache/nutch/util/DumpFileUtilTest.java |  68 --
 .../apache/nutch/util/TestEncodingDetector.java |  90 --
 .../org/apache/nutch/util/TestGZIPUtils.java    | 241 -----
 .../org/apache/nutch/util/TestMimeUtil.java     | 127 ---
 .../org/apache/nutch/util/TestNodeWalker.java   | 107 ---
 .../nutch/util/TestPrefixStringMatcher.java     | 115 ---
 .../org/apache/nutch/util/TestStringUtil.java   |  61 --
 .../nutch/util/TestSuffixStringMatcher.java     | 114 ---
 .../org/apache/nutch/util/TestTableUtil.java    |  75 --
 src/test/org/apache/nutch/util/TestURLUtil.java | 281 ------
 .../apache/nutch/util/WritableTestUtils.java    |  55 --
 .../fetch-test-site/dup_of_pagea.html           |  11 -
 .../fetch-test-site/exception.html              |  13 -
 src/testresources/fetch-test-site/index.html    |  13 -
 .../fetch-test-site/nested_spider_trap.html     |  23 -
 src/testresources/fetch-test-site/pagea.html    |  11 -
 src/testresources/fetch-test-site/pageb.html    |  11 -
 src/testresources/fetch-test-site/robots.txt    |   0
 src/testresources/test-mime-util/test.xlsx      | Bin 3950 -> 0 bytes
 .../20150309101625/content/part-00000/.data.crc | Bin 124 -> 0 bytes
 .../content/part-00000/.index.crc               | Bin 12 -> 0 bytes
 .../20150309101625/content/part-00000/data      | Bin 14452 -> 0 bytes
 .../20150309101625/content/part-00000/index     | Bin 217 -> 0 bytes
 .../crawl_fetch/part-00000/.data.crc            | Bin 12 -> 0 bytes
 .../crawl_fetch/part-00000/.index.crc           | Bin 12 -> 0 bytes
 .../20150309101625/crawl_fetch/part-00000/data  | Bin 293 -> 0 bytes
 .../20150309101625/crawl_fetch/part-00000/index | Bin 217 -> 0 bytes
 .../crawl_generate/.part-00000.crc              | Bin 12 -> 0 bytes
 .../20150309101625/crawl_generate/part-00000    | Bin 169 -> 0 bytes
 .../20150309101625/crawl_parse/.part-00000.crc  | Bin 68 -> 0 bytes
 .../20150309101625/crawl_parse/part-00000       | Bin 7627 -> 0 bytes
 .../parse_data/part-00000/.data.crc             | Bin 24 -> 0 bytes
 .../parse_data/part-00000/.index.crc            | Bin 12 -> 0 bytes
 .../20150309101625/parse_data/part-00000/data   | Bin 1985 -> 0 bytes
 .../20150309101625/parse_data/part-00000/index  | Bin 217 -> 0 bytes
 .../parse_text/part-00000/.data.crc             | Bin 60 -> 0 bytes
 .../parse_text/part-00000/.index.crc            | Bin 12 -> 0 bytes
 .../20150309101625/parse_text/part-00000/data   | Bin 6554 -> 0 bytes
 .../20150309101625/parse_text/part-00000/index  | Bin 217 -> 0 bytes
 .../20150309101656/content/part-00000/.data.crc | Bin 3372 -> 0 bytes
 .../content/part-00000/.index.crc               | Bin 12 -> 0 bytes
 .../20150309101656/content/part-00000/data      | Bin 430250 -> 0 bytes
 .../20150309101656/content/part-00000/index     | Bin 220 -> 0 bytes
 .../crawl_fetch/part-00000/.data.crc            | Bin 104 -> 0 bytes
 .../crawl_fetch/part-00000/.index.crc           | Bin 12 -> 0 bytes
 .../20150309101656/crawl_fetch/part-00000/data  | Bin 12121 -> 0 bytes
 .../20150309101656/crawl_fetch/part-00000/index | Bin 220 -> 0 bytes
 .../crawl_generate/.part-00000.crc              | Bin 52 -> 0 bytes
 .../20150309101656/crawl_generate/part-00000    | Bin 5590 -> 0 bytes
 .../20150309101656/crawl_parse/.part-00000.crc  | Bin 1652 -> 0 bytes
 .../20150309101656/crawl_parse/part-00000       | Bin 210047 -> 0 bytes
 .../parse_data/part-00000/.data.crc             | Bin 460 -> 0 bytes
 .../parse_data/part-00000/.index.crc            | Bin 12 -> 0 bytes
 .../20150309101656/parse_data/part-00000/data   | Bin 57355 -> 0 bytes
 .../20150309101656/parse_data/part-00000/index  | Bin 220 -> 0 bytes
 .../parse_text/part-00000/.data.crc             | Bin 1260 -> 0 bytes
 .../parse_text/part-00000/.index.crc            | Bin 12 -> 0 bytes
 .../20150309101656/parse_text/part-00000/data   | Bin 159920 -> 0 bytes
 .../20150309101656/parse_text/part-00000/index  | Bin 220 -> 0 bytes
 1973 files changed, 102499 insertions(+), 98774 deletions(-)
----------------------------------------------------------------------



[43/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java
new file mode 100644
index 0000000..78ccb27
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * Interface used to convert URLs to normal form and optionally perform
+ * substitutions. Implementations are loaded as plugins via the
+ * {@link #X_POINT_ID} extension point; instances may be cached and reused
+ * across calls by the framework.
+ */
+public interface URLNormalizer extends Configurable {
+
+  /** Extension point ID under which URLNormalizer plugins are registered. */
+  public static final String X_POINT_ID = URLNormalizer.class.getName();
+
+  /**
+   * Converts a URL string to its normal form.
+   *
+   * @param urlString the URL to normalize
+   * @param scope the context ("scope") in which normalization is requested;
+   *          an implementation may apply different rules per scope
+   * @return the normalized URL string
+   * @throws MalformedURLException if the given URL string cannot be parsed
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException;
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java
new file mode 100644
index 0000000..d8f1c6e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * Command-line tool to check URL normalizer plugins: reads URLs from
+ * standard input, runs either one named {@link URLNormalizer} or the whole
+ * configured chain of normalizers on each line, and prints the normalized
+ * result to standard output.
+ */
+public class URLNormalizerChecker {
+
+  /** Nutch configuration used to look up normalizer plugins. */
+  private Configuration conf;
+
+  /**
+   * @param conf Nutch configuration the checker operates against
+   */
+  public URLNormalizerChecker(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Runs a single normalizer, identified by its fully qualified class name,
+   * on every line read from standard input and prints each result.
+   *
+   * @param normalizerName fully qualified class name of the normalizer
+   * @param scope normalization scope passed to the normalizer
+   * @throws Exception if the extension point or the named normalizer cannot
+   *           be found, or if reading/normalizing fails
+   */
+  private void checkOne(String normalizerName, String scope) throws Exception {
+    URLNormalizer normalizer = null;
+
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLNormalizer.X_POINT_ID);
+
+    if (point == null)
+      throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found.");
+
+    Extension[] extensions = point.getExtensions();
+
+    // Scan all registered extensions for the one whose implementation class
+    // matches the requested name; instances of non-matching plugins are
+    // discarded.
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      normalizer = (URLNormalizer) extension.getExtensionInstance();
+      if (normalizer.getClass().getName().equals(normalizerName)) {
+        break;
+      } else {
+        normalizer = null;
+      }
+    }
+
+    if (normalizer == null)
+      throw new RuntimeException("URLNormalizer " + normalizerName
+          + " not found.");
+
+    System.out.println("Checking URLNormalizer " + normalizerName);
+
+    // Reader is intentionally not closed: it wraps System.in.
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = normalizer.normalize(line, scope);
+      System.out.println(out);
+    }
+  }
+
+  /**
+   * Runs the full chain of all activated normalizers (built by
+   * {@link URLNormalizers}) on every line read from standard input and
+   * prints each result.
+   *
+   * @param scope normalization scope used to build and invoke the chain
+   * @throws Exception if reading or normalizing fails
+   */
+  private void checkAll(String scope) throws Exception {
+    System.out.println("Checking combination of all URLNormalizers available");
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    URLNormalizers normalizers = new URLNormalizers(conf, scope);
+    while ((line = in.readLine()) != null) {
+      String out = normalizers.normalize(line, scope);
+      System.out.println(out);
+    }
+  }
+
+  /**
+   * Entry point. Usage:
+   * {@code URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]}.
+   * When {@code -normalizer} is omitted, the combined chain of all activated
+   * normalizers is checked. Exits with -1 on bad arguments, 0 otherwise.
+   *
+   * @param args command-line arguments
+   * @throws Exception on any failure while checking
+   */
+  public static void main(String[] args) throws Exception {
+
+    String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]"
+        + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";
+
+    String normalizerName = null;
+    String scope = URLNormalizers.SCOPE_DEFAULT;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-normalizer")) {
+        normalizerName = args[++i];
+      } else if (args[i].equals("-scope")) {
+        scope = args[++i];
+      } else {
+        System.err.println(usage);
+        System.exit(-1);
+      }
+    }
+
+    URLNormalizerChecker checker = new URLNormalizerChecker(
+        NutchConfiguration.create());
+    if (normalizerName != null) {
+      checker.checkOne(normalizerName, scope);
+    } else {
+      checker.checkAll(scope);
+    }
+
+    System.exit(0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java
new file mode 100644
index 0000000..7a34353
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java
@@ -0,0 +1,325 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.Vector;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.ObjectCache;
+
+/**
+ * This class uses a "chained filter" pattern to run defined normalizers.
+ * Different lists of normalizers may be defined for different "scopes", or
+ * contexts where they are used (note however that they need to be activated
+ * first through <tt>plugin.include</tt> property).
+ *
+ * <p>
+ * There is one global scope defined by default, which consists of all active
+ * normalizers. The order in which these normalizers are executed may be defined
+ * in "urlnormalizer.order" property, which lists space-separated implementation
+ * classes (if this property is missing normalizers will be run in random
+ * order). If there are more normalizers activated than explicitly named on this
+ * list, the remaining ones will be run in random order after the ones specified
+ * on the list are executed.
+ * </p>
+ * <p>
+ * You can define a set of contexts (or scopes) in which normalizers may be
+ * called. Each scope can have its own list of normalizers (defined in
+ * "urlnormalizer.scope.&lt;scope_name&gt;" property) and its own order (defined in
+ * "urlnormalizer.order.&lt;scope_name&gt;" property). If any of these properties are
+ * missing, default settings are used for the global scope.
+ * </p>
+ * <p>
+ * In case no normalizers are required for any given scope, a
+ * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should
+ * be used.
+ * </p>
+ * <p>
+ * Each normalizer may further select among many configurations, depending on
+ * the scope in which it is called, because the scope name is passed as a
+ * parameter to each normalizer. You can also use the same normalizer for many
+ * scopes.
+ * </p>
+ * <p>
+ * Several scopes have been defined, and various Nutch tools will attempt using
+ * scope-specific normalizers first (and fall back to default config if
+ * scope-specific configuration is missing).
+ * </p>
+ * <p>
+ * Normalizers may be run several times, to ensure that modifications introduced
+ * by normalizers at the end of the list can be further reduced by normalizers
+ * executed at the beginning. By default this loop is executed just once - if
+ * you want to ensure that all possible combinations have been applied you may
+ * want to run this loop up to the number of activated normalizers. This loop
+ * count can be configured through <tt>urlnormalizer.loop.count</tt> property.
+ * As soon as the url is unchanged the loop will stop and return the result.
+ * </p>
+ *
+ * @author Andrzej Bialecki
+ */
+public final class URLNormalizers {
+
+  /**
+   * Default scope. If no scope properties are defined then the configuration
+   * for this scope will be used.
+   */
+  public static final String SCOPE_DEFAULT = "default";
+  /** Scope used by {@link org.apache.nutch.crawl.URLPartitioner}. */
+  public static final String SCOPE_PARTITION = "partition";
+  /** Scope used by {@link org.apache.nutch.crawl.Generator}. */
+  public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count";
+  /**
+   * Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing
+   * redirect URLs.
+   */
+  public static final String SCOPE_FETCHER = "fetcher";
+  /** Scope used when updating the CrawlDb with new URLs. */
+  public static final String SCOPE_CRAWLDB = "crawldb";
+  /** Scope used when updating the LinkDb with new URLs. */
+  public static final String SCOPE_LINKDB = "linkdb";
+  /** Scope used by {@link org.apache.nutch.crawl.Injector}. */
+  public static final String SCOPE_INJECT = "inject";
+  /**
+   * Scope used when constructing new {@link org.apache.nutch.parse.Outlink}
+   * instances.
+   */
+  public static final String SCOPE_OUTLINK = "outlink";
+  /** Scope used when indexing URLs. */
+  public static final String SCOPE_INDEXER = "indexer";
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(URLNormalizers.class);
+
+  /* Empty extension list for caching purposes. */
+  private final List<Extension> EMPTY_EXTENSION_LIST = Collections
+      .<Extension> emptyList();
+
+  /** Sentinel array meaning "no normalizers configured for this scope". */
+  private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0];
+
+  /** Configuration from which the urlnormalizer.* properties are read. */
+  private Configuration conf;
+
+  /** Extension point under which URLNormalizer plugins register. */
+  private ExtensionPoint extensionPoint;
+
+  /** Chain of normalizers to run, in order, for the construction scope. */
+  private URLNormalizer[] normalizers;
+
+  /** Max passes over the chain; see urlnormalizer.loop.count (default 1). */
+  private int loopCount;
+
+  /**
+   * Builds the normalizer chain for the given scope, falling back to the
+   * chain of {@link #SCOPE_DEFAULT} when the scope has no normalizers
+   * configured.
+   *
+   * @param conf configuration to read plugin and ordering properties from
+   * @param scope scope name (one of the SCOPE_* constants or a custom name)
+   */
+  public URLNormalizers(Configuration conf, String scope) {
+    this.conf = conf;
+    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
+        URLNormalizer.X_POINT_ID);
+    ObjectCache objectCache = ObjectCache.get(conf);
+
+    if (this.extensionPoint == null) {
+      throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID
+          + " not found.");
+    }
+
+    // NOTE(review): the arrays computed below are never written back into
+    // objectCache under these keys, so both lookups always miss -- confirm
+    // whether caching the resolved arrays was intended here.
+    normalizers = (URLNormalizer[]) objectCache
+        .getObject(URLNormalizer.X_POINT_ID + "_" + scope);
+    if (normalizers == null) {
+      normalizers = getURLNormalizers(scope);
+    }
+    // Nothing configured for this scope: fall back to the default scope.
+    if (normalizers == EMPTY_NORMALIZERS) {
+      normalizers = (URLNormalizer[]) objectCache
+          .getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
+      if (normalizers == null) {
+        normalizers = getURLNormalizers(SCOPE_DEFAULT);
+      }
+    }
+
+    loopCount = conf.getInt("urlnormalizer.loop.count", 1);
+  }
+
+  /**
+   * Function returns an array of {@link URLNormalizer}s for a given scope, with
+   * a specified order.
+   * 
+   * @param scope
+   *          The scope to return the <code>Array</code> of
+   *          {@link URLNormalizer}s for.
+   * @return An <code>Array</code> of {@link URLNormalizer}s for the given
+   *         scope. Plugins that fail to instantiate are logged and skipped
+   *         rather than rethrown.
+   */
+  URLNormalizer[] getURLNormalizers(String scope) {
+    List<Extension> extensions = getExtensions(scope);
+    ObjectCache objectCache = ObjectCache.get(conf);
+
+    if (extensions == EMPTY_EXTENSION_LIST) {
+      return EMPTY_NORMALIZERS;
+    }
+
+    List<URLNormalizer> normalizers = new Vector<URLNormalizer>(
+        extensions.size());
+
+    Iterator<Extension> it = extensions.iterator();
+    while (it.hasNext()) {
+      Extension ext = it.next();
+      URLNormalizer normalizer = null;
+      try {
+        // check to see if we've cached this URLNormalizer instance yet
+        normalizer = (URLNormalizer) objectCache.getObject(ext.getId());
+        if (normalizer == null) {
+          // go ahead and instantiate it and then cache it
+          normalizer = (URLNormalizer) ext.getExtensionInstance();
+          objectCache.setObject(ext.getId(), normalizer);
+        }
+        normalizers.add(normalizer);
+      } catch (PluginRuntimeException e) {
+        e.printStackTrace();
+        LOG.warn("URLNormalizers:PluginRuntimeException when "
+            + "initializing url normalizer plugin "
+            + ext.getDescriptor().getPluginId()
+            + " instance in getURLNormalizers "
+            + "function: attempting to continue instantiating plugins");
+      }
+    }
+    return normalizers.toArray(new URLNormalizer[normalizers.size()]);
+  }
+
+  /**
+   * Finds the best-suited normalizer plugin for a given scope.
+   * 
+   * @param scope
+   *          Scope for which we seek a normalizer plugin.
+   * @return a list of extensions to be used for this scope. If none, returns
+   *         empty list. Results (including the empty result) are memoized in
+   *         the ObjectCache.
+   */
+  @SuppressWarnings("unchecked")
+  private List<Extension> getExtensions(String scope) {
+    ObjectCache objectCache = ObjectCache.get(conf);
+    List<Extension> extensions = (List<Extension>) objectCache
+        .getObject(URLNormalizer.X_POINT_ID + "_x_" + scope);
+
+    // Just compare the reference:
+    // if this is the empty list, we know we will find no extension.
+    if (extensions == EMPTY_EXTENSION_LIST) {
+      return EMPTY_EXTENSION_LIST;
+    }
+
+    if (extensions == null) {
+      extensions = findExtensions(scope);
+      if (extensions != null) {
+        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+            extensions);
+      } else {
+        // Put the empty extension list into cache
+        // to remember we don't know any related extension.
+        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+            EMPTY_EXTENSION_LIST);
+        extensions = EMPTY_EXTENSION_LIST;
+      }
+    }
+    return extensions;
+  }
+
+  /**
+   * searches a list of suitable url normalizer plugins for the given scope.
+   * Honors "urlnormalizer.scope.&lt;scope&gt;" (which plugins) and
+   * "urlnormalizer.order[.&lt;scope&gt;]" (in what order); unnamed plugins are
+   * appended in unspecified (hash-map) order.
+   * 
+   * @param scope
+   *          Scope for which we seek a url normalizer plugin.
+   * @return List - List of extensions to be used for this scope. If none,
+   *         returns null.
+   */
+  private List<Extension> findExtensions(String scope) {
+
+    String[] orders = null;
+    String orderlist = conf.get("urlnormalizer.order." + scope);
+    if (orderlist == null)
+      orderlist = conf.get("urlnormalizer.order");
+    if (orderlist != null && !orderlist.trim().equals("")) {
+      orders = orderlist.trim().split("\\s+");
+    }
+    String scopelist = conf.get("urlnormalizer.scope." + scope);
+    Set<String> impls = null;
+    if (scopelist != null && !scopelist.trim().equals("")) {
+      String[] names = scopelist.split("\\s+");
+      impls = new HashSet<String>(Arrays.asList(names));
+    }
+    Extension[] extensions = this.extensionPoint.getExtensions();
+    HashMap<String, Extension> normalizerExtensions = new HashMap<String, Extension>();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      // When a scope list is configured, keep only the listed classes.
+      if (impls != null && !impls.contains(extension.getClazz()))
+        continue;
+      normalizerExtensions.put(extension.getClazz(), extension);
+    }
+    List<Extension> res = new ArrayList<Extension>();
+    if (orders == null) {
+      res.addAll(normalizerExtensions.values());
+    } else {
+      // first add those explicitly named in correct order
+      for (int i = 0; i < orders.length; i++) {
+        Extension e = normalizerExtensions.get(orders[i]);
+        if (e != null) {
+          res.add(e);
+          normalizerExtensions.remove(orders[i]);
+        }
+      }
+      // then add all others in random order
+      res.addAll(normalizerExtensions.values());
+    }
+    return res;
+  }
+
+  /**
+   * Normalize
+   * 
+   * @param urlString
+   *          The URL string to normalize.
+   * @param scope
+   *          The given scope.
+   * @return A normalized String, using the given <code>scope</code>, or
+   *         <code>null</code> if any normalizer in the chain returned null.
+   * @throws MalformedURLException
+   *           If the given URL string is malformed.
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    // optionally loop several times, and break if no further changes
+    String initialString = urlString;
+    for (int k = 0; k < loopCount; k++) {
+      for (int i = 0; i < this.normalizers.length; i++) {
+        if (urlString == null)
+          return null;
+        urlString = this.normalizers[i].normalize(urlString, scope);
+      }
+      if (initialString.equals(urlString))
+        break;
+      initialString = urlString;
+    }
+    return urlString;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/package-info.java b/nutch-core/src/main/java/org/apache/nutch/net/package-info.java
new file mode 100644
index 0000000..19e0111
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Web-related interfaces: URL {@link org.apache.nutch.net.URLFilter filters}
+ * and {@link org.apache.nutch.net.URLNormalizer normalizers}.
+ */
+package org.apache.nutch.net;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java b/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java
new file mode 100644
index 0000000..5f4115b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.protocols;
+
+import java.util.Calendar;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+import java.text.SimpleDateFormat;
+import java.text.ParseException;
+
+/**
+ * class to handle HTTP dates.
+ * 
+ * Modified from FastHttpDateFormat.java in jakarta-tomcat.
+ * 
+ * @author John Xing
+ */
+public class HttpDateFormat {
+
+  protected static SimpleDateFormat format = new SimpleDateFormat(
+      "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
+
+  /**
+   * HTTP date uses TimeZone GMT
+   */
+  static {
+    format.setTimeZone(TimeZone.getTimeZone("GMT"));
+  }
+
+  // HttpDate (long t) {
+  // }
+
+  // HttpDate (String s) {
+  // }
+
+  // /**
+  // * Get the current date in HTTP format.
+  // */
+  // public static String getCurrentDate() {
+  //
+  // long now = System.currentTimeMillis();
+  // if ((now - currentDateGenerated) > 1000) {
+  // synchronized (format) {
+  // if ((now - currentDateGenerated) > 1000) {
+  // currentDateGenerated = now;
+  // currentDate = format.format(new Date(now));
+  // }
+  // }
+  // }
+  // return currentDate;
+  //
+  // }
+
+  /**
+   * Get the HTTP format of the specified date.
+   */
+  public static String toString(Date date) {
+    String string;
+    synchronized (format) {
+      string = format.format(date);
+    }
+    return string;
+  }
+
+  public static String toString(Calendar cal) {
+    String string;
+    synchronized (format) {
+      string = format.format(cal.getTime());
+    }
+    return string;
+  }
+
+  public static String toString(long time) {
+    String string;
+    synchronized (format) {
+      string = format.format(new Date(time));
+    }
+    return string;
+  }
+
+  public static Date toDate(String dateString) throws ParseException {
+    Date date;
+    synchronized (format) {
+      date = format.parse(dateString);
+    }
+    return date;
+  }
+
+  public static long toLong(String dateString) throws ParseException {
+    long time;
+    synchronized (format) {
+      time = format.parse(dateString).getTime();
+    }
+    return time;
+  }
+
+  public static void main(String[] args) throws Exception {
+    Date now = new Date(System.currentTimeMillis());
+
+    String string = HttpDateFormat.toString(now);
+
+    long time = HttpDateFormat.toLong(string);
+
+    System.out.println(string);
+    System.out.println(HttpDateFormat.toString(time));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java b/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java
new file mode 100644
index 0000000..0ae3776
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.protocols;
+
+import java.io.Serializable;
+
/**
 * Base exception for all protocol handlers
 * 
 * @deprecated Use {@link org.apache.nutch.protocol.ProtocolException} instead.
 */
@Deprecated
@SuppressWarnings("serial")
public class ProtocolException extends Exception implements Serializable {

  /** Constructs an exception with neither detail message nor cause. */
  public ProtocolException() {
  }

  /** Constructs an exception with the given detail message. */
  public ProtocolException(String message) {
    super(message);
  }

  /** Constructs an exception with the given detail message and cause. */
  public ProtocolException(String message, Throwable cause) {
    super(message, cause);
  }

  /** Constructs an exception with the given cause. */
  public ProtocolException(Throwable cause) {
    super(cause);
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java b/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java
new file mode 100644
index 0000000..efff14b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.protocols;
+
+// JDK imports
+import java.net.URL;
+
+// Nutch imports
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.metadata.Metadata;
+
/**
 * A protocol response. Makes all protocols model HTTP: every response
 * exposes a numeric status code, named headers, and a raw content body.
 */
public interface Response extends HttpHeaders {

  /** Returns the URL used to retrieve this response. */
  public URL getUrl();

  /** Returns the response status code (HTTP semantics, e.g. 200 for OK). */
  public int getCode();

  /**
   * Returns the value of a named header.
   * NOTE(review): presumably returns <code>null</code> when the header is
   * absent, per HTTP convention -- confirm against implementations.
   */
  public String getHeader(String name);

  /** Returns all the headers. */
  public Metadata getHeaders();

  /** Returns the full raw content of the response as bytes. */
  public byte[] getContent();

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java b/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java
new file mode 100644
index 0000000..8823f5b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java
@@ -0,0 +1,23 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Helper classes related to the {@link org.apache.nutch.protocol.Protocol Protocol}
 * interface, see also {@link org.apache.nutch.protocol}.
 */
package org.apache.nutch.net.protocols;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java b/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java
new file mode 100644
index 0000000..c36c036
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.net.URL;
+import java.util.Iterator;
+import java.util.Properties;
+
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * This class holds the information about HTML "meta" tags extracted from a
+ * page. Some special tags have convenience methods for easy checking.
+ */
+public class HTMLMetaTags {
+  private boolean noIndex = false;
+
+  private boolean noFollow = false;
+
+  private boolean noCache = false;
+
+  private URL baseHref = null;
+
+  private boolean refresh = false;
+
+  private int refreshTime = 0;
+
+  private URL refreshHref = null;
+
+  private Metadata generalTags = new Metadata();
+
+  private Properties httpEquivTags = new Properties();
+
+  /**
+   * Sets all boolean values to <code>false</code>. Clears all other tags.
+   */
+  public void reset() {
+    noIndex = false;
+    noFollow = false;
+    noCache = false;
+    refresh = false;
+    refreshTime = 0;
+    baseHref = null;
+    refreshHref = null;
+    generalTags.clear();
+    httpEquivTags.clear();
+  }
+
+  /**
+   * Sets <code>noFollow</code> to <code>true</code>.
+   */
+  public void setNoFollow() {
+    noFollow = true;
+  }
+
+  /**
+   * Sets <code>noIndex</code> to <code>true</code>.
+   */
+  public void setNoIndex() {
+    noIndex = true;
+  }
+
+  /**
+   * Sets <code>noCache</code> to <code>true</code>.
+   */
+  public void setNoCache() {
+    noCache = true;
+  }
+
+  /**
+   * Sets <code>refresh</code> to the supplied value.
+   */
+  public void setRefresh(boolean refresh) {
+    this.refresh = refresh;
+  }
+
+  /**
+   * Sets the <code>baseHref</code>.
+   */
+  public void setBaseHref(URL baseHref) {
+    this.baseHref = baseHref;
+  }
+
+  /**
+   * Sets the <code>refreshHref</code>.
+   */
+  public void setRefreshHref(URL refreshHref) {
+    this.refreshHref = refreshHref;
+  }
+
+  /**
+   * Sets the <code>refreshTime</code>.
+   */
+  public void setRefreshTime(int refreshTime) {
+    this.refreshTime = refreshTime;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noIndex</code>.
+   */
+  public boolean getNoIndex() {
+    return noIndex;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noFollow</code>.
+   */
+  public boolean getNoFollow() {
+    return noFollow;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noCache</code>.
+   */
+  public boolean getNoCache() {
+    return noCache;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>refresh</code>.
+   */
+  public boolean getRefresh() {
+    return refresh;
+  }
+
+  /**
+   * A convenience method. Returns the <code>baseHref</code>, if set, or
+   * <code>null</code> otherwise.
+   */
+  public URL getBaseHref() {
+    return baseHref;
+  }
+
+  /**
+   * A convenience method. Returns the <code>refreshHref</code>, if set, or
+   * <code>null</code> otherwise. The value may be invalid if
+   * {@link #getRefresh()}returns <code>false</code>.
+   */
+  public URL getRefreshHref() {
+    return refreshHref;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>refreshTime</code>
+   * . The value may be invalid if {@link #getRefresh()}returns
+   * <code>false</code>.
+   */
+  public int getRefreshTime() {
+    return refreshTime;
+  }
+
+  /**
+   * Returns all collected values of the general meta tags. Property names are
+   * tag names, property values are "content" values.
+   */
+  public Metadata getGeneralTags() {
+    return generalTags;
+  }
+
+  /**
+   * Returns all collected values of the "http-equiv" meta tags. Property names
+   * are tag names, property values are "content" values.
+   */
+  public Properties getHttpEquivTags() {
+    return httpEquivTags;
+  }
+
+  public String toString() {
+    StringBuffer sb = new StringBuffer();
+    sb.append("base=" + baseHref + ", noCache=" + noCache + ", noFollow="
+        + noFollow + ", noIndex=" + noIndex + ", refresh=" + refresh
+        + ", refreshHref=" + refreshHref + "\n");
+    sb.append(" * general tags:\n");
+    String[] names = generalTags.names();
+    for (String name : names) {
+      String key = name;
+      sb.append("   - " + key + "\t=\t" + generalTags.get(key) + "\n");
+    }
+    sb.append(" * http-equiv tags:\n");
+    Iterator<Object> it = httpEquivTags.keySet().iterator();
+    it = httpEquivTags.keySet().iterator();
+    while (it.hasNext()) {
+      String key = (String) it.next();
+      sb.append("   - " + key + "\t=\t" + httpEquivTags.get(key) + "\n");
+    }
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java
new file mode 100644
index 0000000..55b51ac
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// JDK imports
+import org.w3c.dom.DocumentFragment;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+import org.apache.nutch.protocol.Content;
+
/**
 * Extension point for DOM-based HTML parsers. Permits one to add additional
 * metadata to HTML parses. All plugins found which implement this extension
 * point are run sequentially on the parse.
 */
public interface HtmlParseFilter extends Pluggable, Configurable {
  /** The name of the extension point. */
  final static String X_POINT_ID = HtmlParseFilter.class.getName();

  /**
   * Adds metadata or otherwise modifies a parse of HTML content, given the DOM
   * tree of a page.
   *
   * @param content the raw fetched content of the page
   * @param parseResult the result of the preceding parse / filter steps
   * @param metaTags the HTML meta tags extracted from the page
   * @param doc the DOM tree of the page
   * @return the (possibly modified) parse result
   */
  ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc);
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java
new file mode 100644
index 0000000..9dd9aad
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.DocumentFragment;
+
+/** Creates and caches {@link HtmlParseFilter} implementing plugins. */
+public class HtmlParseFilters {
+
+  private HtmlParseFilter[] htmlParseFilters;
+
+  public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";
+
+  public HtmlParseFilters(Configuration conf) {
+    htmlParseFilters = (HtmlParseFilter[]) PluginRepository.get(conf)
+        .getOrderedPlugins(HtmlParseFilter.class, HtmlParseFilter.X_POINT_ID,
+            HTMLPARSEFILTER_ORDER);
+  }
+
+  /** Run all defined filters. */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // loop on each filter
+    for (int i = 0; i < this.htmlParseFilters.length; i++) {
+      // call filter interface
+      parseResult = htmlParseFilters[i].filter(content, parseResult, metaTags,
+          doc);
+
+      // any failure on parse obj, return
+      if (!parseResult.isSuccess()) {
+        // TODO: What happens when parseResult.isEmpty() ?
+        // Maybe clone parseResult and use parseResult as backup...
+
+        // remove failed parse before return
+        parseResult.filter();
+        return parseResult;
+      }
+    }
+
+    return parseResult;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java b/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java
new file mode 100644
index 0000000..3ee0354
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/* An outgoing link from a page. */
+public class Outlink implements Writable {
+
+  private String toUrl;
+  private String anchor;
+  private MapWritable md;
+
+  public Outlink() {
+  }
+
+  public Outlink(String toUrl, String anchor) throws MalformedURLException {
+    this.toUrl = toUrl;
+    if (anchor == null)
+      anchor = "";
+    this.anchor = anchor;
+    md = null;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    toUrl = Text.readString(in);
+    anchor = Text.readString(in);
+    boolean hasMD = in.readBoolean();
+    if (hasMD) {
+      md = new org.apache.hadoop.io.MapWritable();
+      md.readFields(in);
+    } else
+      md = null;
+  }
+
+  /** Skips over one Outlink in the input. */
+  public static void skip(DataInput in) throws IOException {
+    Text.skip(in); // skip toUrl
+    Text.skip(in); // skip anchor
+    boolean hasMD = in.readBoolean();
+    if (hasMD) {
+      MapWritable metadata = new org.apache.hadoop.io.MapWritable();
+      metadata.readFields(in);
+      ;
+    }
+  }
+
+  public void write(DataOutput out) throws IOException {
+    Text.writeString(out, toUrl);
+    Text.writeString(out, anchor);
+    if (md != null && md.size() > 0) {
+      out.writeBoolean(true);
+      md.write(out);
+    } else {
+      out.writeBoolean(false);
+    }
+  }
+
+  public static Outlink read(DataInput in) throws IOException {
+    Outlink outlink = new Outlink();
+    outlink.readFields(in);
+    return outlink;
+  }
+
+  public String getToUrl() {
+    return toUrl;
+  }
+
+  public void setUrl(String toUrl) {
+    this.toUrl = toUrl;
+  }
+
+  public String getAnchor() {
+    return anchor;
+  }
+
+  public MapWritable getMetadata() {
+    return md;
+  }
+
+  public void setMetadata(MapWritable md) {
+    this.md = md;
+  }
+
+  public boolean equals(Object o) {
+    if (!(o instanceof Outlink))
+      return false;
+    Outlink other = (Outlink) o;
+    return this.toUrl.equals(other.toUrl) && this.anchor.equals(other.anchor);
+  }
+
+  public String toString() {
+    StringBuffer repr = new StringBuffer("toUrl: ");
+    repr.append(toUrl);
+    repr.append(" anchor: ");
+    repr.append(anchor);
+    if (md != null && !md.isEmpty()) {
+      for (Entry<Writable, Writable> e : md.entrySet()) {
+        repr.append(" ");
+        repr.append(e.getKey());
+        repr.append(": ");
+        repr.append(e.getValue());
+      }
+    }
+    return repr.toString();
+  }
+
+  @Override
+  public int hashCode() {
+    return toUrl.hashCode() ^ anchor.hashCode();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java b/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java
new file mode 100644
index 0000000..d1773f8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+
+/**
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
+ * plain text using Regular Expressions.
+ * 
+ * @see <a
+ *      href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
+ *      of different regexp-Implementations </a>
+ * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
+ *      </a>
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @since 0.7
+ */
+public class OutlinkExtractor {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(OutlinkExtractor.class);
+
+  /**
+   * Regex pattern to get URLs within a plain text.
+   * 
+   * @see <a
+   *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+
+   *      </a>
+   */
+  private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text. Applying this method
+   * to non-plain-text can result in extremely lengthy runtimes for parasitic
+   * cases (postscript is a known example).
+   * 
+   * @param plainText
+   *          the plain text from wich URLs should be extracted.
+   * 
+   * @return Array of <code>Outlink</code>s within found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText, Configuration conf) {
+    return OutlinkExtractor.getOutlinks(plainText, "", conf);
+  }
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text and adds anchor to the
+   * extracted <code>Outlink</code>s
+   * 
+   * @param plainText
+   *          the plain text from wich URLs should be extracted.
+   * @param anchor
+   *          the anchor of the url
+   * 
+   * @return Array of <code>Outlink</code>s within found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText, String anchor,
+      Configuration conf) {
+    long start = System.currentTimeMillis();
+    final List<Outlink> outlinks = new ArrayList<Outlink>();
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(URL_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+      MatchResult result;
+      String url;
+
+      // loop the matches
+      while (matcher.contains(input, pattern)) {
+        // if this is taking too long, stop matching
+        // (SHOULD really check cpu time used so that heavily loaded systems
+        // do not unnecessarily hit this limit.)
+        if (System.currentTimeMillis() - start >= 60000L) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("Time limit exceeded for getOutLinks");
+          }
+          break;
+        }
+        result = matcher.getMatch();
+        url = result.group(0);
+        try {
+          outlinks.add(new Outlink(url, anchor));
+        } catch (MalformedURLException mue) {
+          LOG.warn("Invalid url: '" + url + "', skipping.");
+        }
+      }
+    } catch (Exception ex) {
+      // if the matcher fails (perhaps a malformed URL) we just log it and move
+      // on
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getOutlinks", ex);
+      }
+    }
+
+    final Outlink[] retval;
+
+    // create array of the Outlinks
+    if (outlinks != null && outlinks.size() > 0) {
+      retval = outlinks.toArray(new Outlink[0]);
+    } else {
+      retval = new Outlink[0];
+    }
+
+    return retval;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java b/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java
new file mode 100644
index 0000000..9a33445
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+/**
+ * The result of parsing a page's raw content.
+ * 
+ * @see Parser#getParse(Content)
+ */
+public interface Parse {
+
+  /**
+   * The textual content of the page. This is indexed, searched, and used when
+   * generating snippets.
+   *
+   * @return the plain text extracted from the page's raw content
+   */
+  String getText();
+
+  /**
+   * Other data extracted from the page (see {@link ParseData}).
+   *
+   * @return the structured data that accompanies the text
+   */
+  ParseData getData();
+
+  /**
+   * Indicates if the parse is coming from a url or a sub-url.
+   *
+   * @return {@code true} when this parse belongs to the url itself,
+   *         {@code false} when it was derived from a sub-url
+   */
+  boolean isCanonical();
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java
new file mode 100644
index 0000000..12cae8a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.concurrent.Callable;
+
+import org.apache.nutch.protocol.Content;
+
+/**
+ * A {@link Callable} that applies a single {@link Parser} to a single
+ * {@link Content} instance, so that parsing can be executed (and bounded)
+ * on an executor thread.
+ */
+class ParseCallable implements Callable<ParseResult> {
+  // Assigned once at construction and never modified afterwards.
+  private final Parser p;
+  private final Content content;
+
+  public ParseCallable(Parser p, Content content) {
+    this.p = p;
+    this.content = content;
+  }
+
+  /** Runs the parser on the content and returns its result. */
+  @Override
+  public ParseResult call() throws Exception {
+    return p.getParse(content);
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java
new file mode 100644
index 0000000..8189269
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java
@@ -0,0 +1,255 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.fs.FileSystem;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Data extracted from a page's content: parse status, title, outlinks,
+ * content metadata and parse metadata.
+ * 
+ * @see Parse#getData()
+ */
+public final class ParseData extends VersionedWritable {
+  public static final String DIR_NAME = "parse_data";
+
+  /** Current wire-format version (strings written as Text, not UTF8). */
+  private final static byte VERSION = 5;
+
+  private String title;
+  private Outlink[] outlinks;
+  private Metadata contentMeta;
+  private Metadata parseMeta;
+  private ParseStatus status;
+  private byte version = VERSION;
+
+  public ParseData() {
+    contentMeta = new Metadata();
+    parseMeta = new Metadata();
+  }
+
+  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
+      Metadata contentMeta) {
+    this(status, title, outlinks, contentMeta, new Metadata());
+  }
+
+  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
+      Metadata contentMeta, Metadata parseMeta) {
+    this.status = status;
+    this.title = title;
+    this.outlinks = outlinks;
+    this.contentMeta = contentMeta;
+    this.parseMeta = parseMeta;
+  }
+
+  //
+  // Accessor methods
+  //
+
+  /** The status of parsing the page. */
+  public ParseStatus getStatus() {
+    return status;
+  }
+
+  /** The title of the page. */
+  public String getTitle() {
+    return title;
+  }
+
+  /** The outlinks of the page. */
+  public Outlink[] getOutlinks() {
+    return outlinks;
+  }
+
+  /** The original Metadata retrieved from content. */
+  public Metadata getContentMeta() {
+    return contentMeta;
+  }
+
+  /**
+   * Other content properties. This is the place to find format-specific
+   * properties. Different parser implementations for different content types
+   * will populate this differently.
+   */
+  public Metadata getParseMeta() {
+    return parseMeta;
+  }
+
+  public void setParseMeta(Metadata parseMeta) {
+    this.parseMeta = parseMeta;
+  }
+
+  public void setOutlinks(Outlink[] outlinks) {
+    this.outlinks = outlinks;
+  }
+
+  /**
+   * Get a metadata single value. This method first looks for the metadata
+   * value in the parse metadata. If no value is found it then looks for the
+   * metadata in the content metadata.
+   * 
+   * @see #getContentMeta()
+   * @see #getParseMeta()
+   */
+  public String getMeta(String name) {
+    String value = parseMeta.get(name);
+    if (value == null) {
+      value = contentMeta.get(name);
+    }
+    return value;
+  }
+
+  //
+  // Writable methods
+  //
+
+  /** Version byte read from (or to be written to) the wire format. */
+  public byte getVersion() {
+    return version;
+  }
+
+  public final void readFields(DataInput in) throws IOException {
+
+    version = in.readByte();
+    // incompatible change from UTF8 (version < 5) to Text
+    if (version != VERSION)
+      throw new VersionMismatchException(VERSION, version);
+    status = ParseStatus.read(in);
+    title = Text.readString(in); // read title
+
+    int numOutlinks = in.readInt();
+    outlinks = new Outlink[numOutlinks];
+    for (int i = 0; i < numOutlinks; i++) {
+      outlinks[i] = Outlink.read(in);
+    }
+
+    // NOTE: after the version check above, version always equals VERSION (5);
+    // the branches below are kept to document the historical format evolution.
+    if (version < 3) {
+      int propertyCount = in.readInt(); // read metadata
+      contentMeta.clear();
+      for (int i = 0; i < propertyCount; i++) {
+        contentMeta.add(Text.readString(in), Text.readString(in));
+      }
+    } else {
+      contentMeta.clear();
+      contentMeta.readFields(in);
+    }
+    if (version > 3) {
+      parseMeta.clear();
+      parseMeta.readFields(in);
+    }
+  }
+
+  public final void write(DataOutput out) throws IOException {
+    out.writeByte(VERSION); // write version
+    status.write(out); // write status
+    Text.writeString(out, title); // write title
+
+    out.writeInt(outlinks.length); // write outlinks
+    for (int i = 0; i < outlinks.length; i++) {
+      outlinks[i].write(out);
+    }
+    contentMeta.write(out); // write content metadata
+    parseMeta.write(out);
+  }
+
+  /** Reads a complete ParseData instance from the stream. */
+  public static ParseData read(DataInput in) throws IOException {
+    ParseData parseData = new ParseData();
+    parseData.readFields(in);
+    return parseData;
+  }
+
+  //
+  // other methods
+  //
+
+  public boolean equals(Object o) {
+    if (!(o instanceof ParseData))
+      return false;
+    ParseData other = (ParseData) o;
+    return this.status.equals(other.status) && this.title.equals(other.title)
+        && Arrays.equals(this.outlinks, other.outlinks)
+        && this.contentMeta.equals(other.contentMeta)
+        && this.parseMeta.equals(other.parseMeta);
+  }
+
+  /**
+   * Consistent with {@link #equals(Object)}. Previously missing, which
+   * breaks the Object contract for instances used in hash-based collections.
+   */
+  public int hashCode() {
+    return Objects.hash(status, title, contentMeta, parseMeta) * 31
+        + Arrays.hashCode(outlinks);
+  }
+
+  public String toString() {
+    // Local, single-threaded buffer: StringBuilder avoids StringBuffer's
+    // unnecessary synchronization.
+    StringBuilder buffer = new StringBuilder();
+
+    buffer.append("Version: " + version + "\n");
+    buffer.append("Status: " + status + "\n");
+    buffer.append("Title: " + title + "\n");
+
+    if (outlinks != null) {
+      buffer.append("Outlinks: " + outlinks.length + "\n");
+      for (int i = 0; i < outlinks.length; i++) {
+        buffer.append("  outlink: " + outlinks[i] + "\n");
+      }
+    }
+
+    buffer.append("Content Metadata: " + contentMeta + "\n");
+    buffer.append("Parse Metadata: " + parseMeta + "\n");
+
+    return buffer.toString();
+  }
+
+  /** Command-line tool: prints the ParseData stored at a record number. */
+  public static void main(String argv[]) throws Exception {
+    String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";
+
+    if (argv.length < 3) {
+      System.out.println("usage:" + usage);
+      return;
+    }
+
+    Options opts = new Options();
+    Configuration conf = NutchConfiguration.create();
+
+    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
+    String[] remainingArgs = parser.getRemainingArgs();
+    FileSystem fs = FileSystem.get(conf);
+
+    try {
+      int recno = Integer.parseInt(remainingArgs[0]);
+      String segment = remainingArgs[1];
+
+      Path file = new Path(segment, DIR_NAME);
+      System.out.println("Reading from file: " + file);
+
+      // try-with-resources: the reader is closed even if get() throws
+      // (previously close() was skipped on exception).
+      try (ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(),
+          conf)) {
+        ParseData parseDatum = new ParseData();
+        parses.get(recno, parseDatum);
+
+        System.out.println("Retrieved " + recno + " from file " + file);
+        System.out.println(parseDatum);
+      }
+    } finally {
+      fs.close();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java
new file mode 100644
index 0000000..3f27e33
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+/** Signals a failure while parsing a page's raw content. */
+@SuppressWarnings("serial")
+public class ParseException extends Exception {
+
+  /** Constructs a ParseException with no detail message. */
+  public ParseException() {
+    super();
+  }
+
+  /** @param message a description of the parse failure */
+  public ParseException(String message) {
+    super(message);
+  }
+
+  /**
+   * @param message a description of the parse failure
+   * @param cause the underlying exception that triggered this one
+   */
+  public ParseException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /** @param cause the underlying exception that triggered this one */
+  public ParseException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseImpl.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseImpl.java
new file mode 100644
index 0000000..dc72769
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseImpl.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.*;
+import org.apache.hadoop.io.*;
+
+/**
+ * Default {@link Parse} implementation, serializable as a Hadoop
+ * {@link Writable}: a canonical flag, followed by the {@link ParseText}
+ * and the {@link ParseData}.
+ * 
+ * @see Parser#getParse(Content)
+ */
+public class ParseImpl implements Parse, Writable {
+  private ParseText text;
+  private ParseData data;
+  private boolean isCanonical;
+
+  /** No-arg constructor, required for Writable deserialization. */
+  public ParseImpl() {
+  }
+
+  /** Copies an existing parse; the copy is marked canonical. */
+  public ParseImpl(Parse parse) {
+    this(new ParseText(parse.getText()), parse.getData());
+  }
+
+  /** Wraps plain text plus parse data; marked canonical. */
+  public ParseImpl(String text, ParseData data) {
+    this(new ParseText(text), data);
+  }
+
+  /** Wraps parse text plus parse data; marked canonical. */
+  public ParseImpl(ParseText text, ParseData data) {
+    this(text, data, true);
+  }
+
+  /** Full constructor with an explicit canonical flag. */
+  public ParseImpl(ParseText text, ParseData data, boolean isCanonical) {
+    this.text = text;
+    this.data = data;
+    this.isCanonical = isCanonical;
+  }
+
+  /** The extracted plain text of the page. */
+  public String getText() {
+    return text.getText();
+  }
+
+  /** The structured data extracted alongside the text. */
+  public ParseData getData() {
+    return data;
+  }
+
+  /** Whether this parse belongs to the url itself rather than a sub-url. */
+  public boolean isCanonical() {
+    return isCanonical;
+  }
+
+  /** Serializes the canonical flag, then the text, then the data. */
+  public final void write(DataOutput out) throws IOException {
+    out.writeBoolean(isCanonical);
+    text.write(out);
+    data.write(out);
+  }
+
+  /** Reads fields back in the exact order written by {@link #write}. */
+  public void readFields(DataInput in) throws IOException {
+    isCanonical = in.readBoolean();
+    text = new ParseText();
+    text.readFields(in);
+
+    data = new ParseData();
+    data.readFields(in);
+  }
+
+  /** Convenience factory: reads a complete ParseImpl from the stream. */
+  public static ParseImpl read(DataInput in) throws IOException {
+    ParseImpl parsed = new ParseImpl();
+    parsed.readFields(in);
+    return parsed;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java
new file mode 100644
index 0000000..51b32fc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -0,0 +1,398 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.compress.DefaultCodec;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.*;
+
+import java.io.*;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.util.Progressable;
+
+/*
+ * Parse content in a segment. Writes three outputs per parsed entry:
+ * parse_text (MapFile of ParseText), parse_data (MapFile of ParseData)
+ * and crawl_parse (SequenceFile of CrawlDatum for the crawldb update).
+ */
+public class ParseOutputFormat implements OutputFormat<Text, Parse> {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ParseOutputFormat.class);
+  private URLFilters filters;
+  private URLExemptionFilters exemptionFilters;
+  private URLNormalizers normalizers;
+  private ScoringFilters scfilters;
+
+  /** Minimal Map.Entry pairing an outlink URL with its CrawlDatum. */
+  private static class SimpleEntry implements Entry<Text, CrawlDatum> {
+    private Text key;
+    private CrawlDatum value;
+
+    public SimpleEntry(Text key, CrawlDatum value) {
+      this.key = key;
+      this.value = value;
+    }
+
+    public Text getKey() {
+      return key;
+    }
+
+    public CrawlDatum getValue() {
+      return value;
+    }
+
+    public CrawlDatum setValue(CrawlDatum value) {
+      this.value = value;
+      return this.value;
+    }
+  }
+
+  /**
+   * Verifies that an output directory is configured and that this segment
+   * has not already been parsed (no crawl_parse directory exists yet).
+   */
+  public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
+    Path out = FileOutputFormat.getOutputPath(job);
+    if ((out == null) && (job.getNumReduceTasks() != 0)) {
+      throw new InvalidJobConfException("Output directory not set in JobConf.");
+    }
+    if (fs == null) {
+      fs = out.getFileSystem(job);
+    }
+    if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME)))
+      throw new IOException("Segment already parsed!");
+  }
+
+  /**
+   * Opens the three segment writers (parse_text, parse_data, crawl_parse)
+   * and returns a RecordWriter that, for each (url, Parse) pair, stores the
+   * text and data and emits CrawlDatum entries for signatures, parse-meta
+   * carried to the crawldb, redirects and filtered/normalized outlinks.
+   */
+  public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job,
+      String name, Progressable progress) throws IOException {
+
+    // URL filtering/exemption only if enabled (default: on)
+    if (job.getBoolean("parse.filter.urls", true)) {
+      filters = new URLFilters(job);
+      exemptionFilters = new URLExemptionFilters(job);
+    }
+
+    // outlink normalization only if enabled (default: on)
+    if (job.getBoolean("parse.normalize.urls", true)) {
+      normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
+    }
+
+    this.scfilters = new ScoringFilters(job);
+    final int interval = job.getInt("db.fetch.interval.default", 2592000);
+    final boolean ignoreInternalLinks = job.getBoolean(
+        "db.ignore.internal.links", false);
+    final boolean ignoreExternalLinks = job.getBoolean(
+        "db.ignore.external.links", false);
+    final String ignoreExternalLinksMode = job.get(
+        "db.ignore.external.links.mode", "byHost");
+    
+    // negative limit means "no limit"
+    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
+    final boolean isParsing = job.getBoolean("fetcher.parse", true);
+    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
+        : maxOutlinksPerPage;
+    final CompressionType compType = SequenceFileOutputFormat
+        .getOutputCompressionType(job);
+    Path out = FileOutputFormat.getOutputPath(job);
+
+    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
+    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
+    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
+
+    // parse-metadata keys to copy into the crawldb entry (comma-separated)
+    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "")
+        .split(" *, *");
+
+    // textOut Options
+    Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
+    
+    final MapFile.Writer textOut = new MapFile.Writer(job, text,
+        tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
+    
+    // dataOut Options
+    Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dProgressOpt = SequenceFile.Writer.progressable(progress);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dCompOpt = SequenceFile.Writer.compression(compType);
+
+    final MapFile.Writer dataOut = new MapFile.Writer(job, data,
+        dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);
+    
+    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(job, SequenceFile.Writer.file(crawl),
+        SequenceFile.Writer.keyClass(Text.class),
+        SequenceFile.Writer.valueClass(CrawlDatum.class),
+        SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+        SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)),
+        SequenceFile.Writer.blockSize(1073741824),
+        SequenceFile.Writer.compression(compType, new DefaultCodec()),
+        SequenceFile.Writer.progressable(progress),
+        SequenceFile.Writer.metadata(new Metadata())); 
+
+    return new RecordWriter<Text, Parse>() {
+
+      public void write(Text key, Parse parse) throws IOException {
+
+        String fromUrl = key.toString();
+        // host or domain name of the source URL
+        String origin = null;
+        textOut.append(key, new ParseText(parse.getText()));
+
+        ParseData parseData = parse.getData();
+        // recover the signature prepared by Fetcher or ParseSegment
+        String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
+        if (sig != null) {
+          byte[] signature = StringUtil.fromHexString(sig);
+          if (signature != null) {
+            // append a CrawlDatum with a signature
+            CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
+            d.setSignature(signature);
+            crawlOut.append(key, d);
+          }
+        }
+
+        // see if the parse metadata contain things that we'd like
+        // to pass to the metadata of the crawlDB entry
+        CrawlDatum parseMDCrawlDatum = null;
+        for (String mdname : parseMDtoCrawlDB) {
+          String mdvalue = parse.getData().getParseMeta().get(mdname);
+          if (mdvalue != null) {
+            if (parseMDCrawlDatum == null)
+              parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META,
+                  0);
+            parseMDCrawlDatum.getMetaData().put(new Text(mdname),
+                new Text(mdvalue));
+          }
+        }
+        if (parseMDCrawlDatum != null)
+          crawlOut.append(key, parseMDCrawlDatum);
+
+        // need to determine origin (once for all outlinks)
+        if (ignoreExternalLinks || ignoreInternalLinks) {
+          URL originURL = new URL(fromUrl.toString());
+          // based on domain?
+          if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+            origin = URLUtil.getDomainName(originURL).toLowerCase();
+          } 
+          // use host 
+          else {
+            origin = originURL.getHost().toLowerCase();
+          }
+        }
+
+        // a successful parse with SUCCESS_REDIRECT carries the redirect
+        // target in the status message; record it as a linked CrawlDatum
+        ParseStatus pstatus = parseData.getStatus();
+        if (pstatus != null && pstatus.isSuccess()
+            && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+          String newUrl = pstatus.getMessage();
+          int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+          newUrl = filterNormalize(fromUrl, newUrl, origin,
+              ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers,
+              URLNormalizers.SCOPE_FETCHER);
+
+          if (newUrl != null) {
+            String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl,
+                refreshTime < Fetcher.PERM_REFRESH_TIME);
+            CrawlDatum newDatum = new CrawlDatum();
+            newDatum.setStatus(CrawlDatum.STATUS_LINKED);
+            if (reprUrl != null && !reprUrl.equals(newUrl)) {
+              newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                  new Text(reprUrl));
+            }
+            crawlOut.append(new Text(newUrl), newDatum);
+          }
+        }
+
+        // collect outlinks for subsequent db update
+        Outlink[] links = parseData.getOutlinks();
+        int outlinksToStore = Math.min(maxOutlinks, links.length);
+
+        int validCount = 0;
+        CrawlDatum adjust = null;
+        List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(
+            outlinksToStore);
+        List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
+        for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
+          String toUrl = links[i].getToUrl();
+
+          // Only normalize and filter if fetcher.parse = false
+          // (when the fetcher parses, it already did this work)
+          if (!isParsing) {
+            toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin,
+                ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers);
+            if (toUrl == null) {
+              continue;
+            }
+          }
+
+          CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
+          Text targetUrl = new Text(toUrl);
+
+          // see if the outlink has any metadata attached
+          // and if so pass that to the crawldatum so that
+          // the initial score or distribution can use that
+          MapWritable outlinkMD = links[i].getMetadata();
+          if (outlinkMD != null) {
+            target.getMetaData().putAll(outlinkMD);
+          }
+
+          try {
+            scfilters.initialScore(targetUrl, target);
+          } catch (ScoringFilterException e) {
+            LOG.warn("Cannot filter init score for url " + key
+                + ", using default: " + e.getMessage());
+            target.setScore(0.0f);
+          }
+
+          targets.add(new SimpleEntry(targetUrl, target));
+
+          // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
+          links[i].setUrl(toUrl);
+          outlinkList.add(links[i]);
+          validCount++;
+        }
+
+        try {
+          // compute score contributions and adjustment to the original score
+          adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets,
+              null, links.length);
+        } catch (ScoringFilterException e) {
+          LOG.warn("Cannot distribute score from " + key + ": "
+              + e.getMessage());
+        }
+        for (Entry<Text, CrawlDatum> target : targets) {
+          crawlOut.append(target.getKey(), target.getValue());
+        }
+        if (adjust != null)
+          crawlOut.append(key, adjust);
+
+        Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList
+            .size()]);
+        // rebuild ParseData so parse_data stores only the surviving outlinks
+        parseData = new ParseData(parseData.getStatus(), parseData.getTitle(),
+            filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
+        dataOut.append(key, parseData);
+        // non-canonical parses (sub-urls) get their own fetch-success datum
+        if (!parse.isCanonical()) {
+          CrawlDatum datum = new CrawlDatum();
+          datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
+          String timeString = parse.getData().getContentMeta()
+              .get(Nutch.FETCH_TIME_KEY);
+          try {
+            datum.setFetchTime(Long.parseLong(timeString));
+          } catch (Exception e) {
+            LOG.warn("Can't read fetch time for: " + key);
+            datum.setFetchTime(System.currentTimeMillis());
+          }
+          crawlOut.append(key, datum);
+        }
+      }
+
+      public void close(Reporter reporter) throws IOException {
+        textOut.close();
+        dataOut.close();
+        crawlOut.close();
+      }
+
+    };
+
+  }
+
+  /**
+   * Convenience overload of
+   * {@link #filterNormalize(String, String, String, boolean, boolean, String, URLFilters, URLExemptionFilters, URLNormalizers, String)}
+   * using the outlink normalizer scope.
+   */
+  public static String filterNormalize(String fromUrl, String toUrl,
+      String fromHost, boolean ignoreInternalLinks, boolean ignoreExternalLinks,
+      String ignoreExternalLinksMode, URLFilters filters, URLExemptionFilters exemptionFilters,
+      URLNormalizers normalizers) {
+    return filterNormalize(fromUrl, toUrl, fromHost, ignoreInternalLinks, ignoreExternalLinks,
+        ignoreExternalLinksMode, filters, exemptionFilters, normalizers,
+        URLNormalizers.SCOPE_OUTLINK);
+  }
+
+  /**
+   * Applies internal/external link policy, URL normalization and URL
+   * filtering to an outlink. Returns the normalized/filtered URL, or
+   * {@code null} when the link should be skipped (self link, policy
+   * rejection, malformed URL, or filtered out).
+   */
+  public static String filterNormalize(String fromUrl, String toUrl,
+      String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks,
+       String ignoreExternalLinksMode, URLFilters filters,
+       URLExemptionFilters exemptionFilters, URLNormalizers normalizers,
+        String urlNormalizerScope) {
+    // ignore links to self (or anchors within the page)
+    if (fromUrl.equals(toUrl)) {
+      return null;
+    }
+    if (ignoreExternalLinks || ignoreInternalLinks) {
+      URL targetURL = null;
+      try {
+        targetURL = new URL(toUrl);
+      } catch (MalformedURLException e1) {
+        return null; // skip it
+      }
+      if (ignoreExternalLinks) {
+        if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+          String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
+          //FIXME: toDomain will never be null, correct?
+          if (toDomain == null || !toDomain.equals(origin)) {
+            return null; // skip it
+          }
+        } else {
+          String toHost = targetURL.getHost().toLowerCase();
+          if (!toHost.equals(origin)) { // external host link
+            if (exemptionFilters == null // check if it is exempted?
+                || !exemptionFilters.isExempted(fromUrl, toUrl)) {
+              return null; ///skip it, This external url is not exempted.
+            }
+          }
+        }
+      }
+      if (ignoreInternalLinks) {
+        if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+          String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
+          //FIXME: toDomain will never be null, correct?
+          if (toDomain == null || toDomain.equals(origin)) {
+            return null; // skip it
+          }
+        } else {
+          String toHost = targetURL.getHost().toLowerCase();
+          //FIXME: toDomain will never be null, correct?
+          if (toHost == null || toHost.equals(origin)) {
+            return null; // skip it
+          }
+        }
+      }
+    }
+
+    try {
+      if (normalizers != null) {
+        toUrl = normalizers.normalize(toUrl, urlNormalizerScope); // normalize
+                                                                  // the url
+      }
+      if (filters != null) {
+        toUrl = filters.filter(toUrl); // filter the url
+      }
+      if (toUrl == null) {
+        return null;
+      }
+    } catch (Exception e) {
+      // normalizer/filter problems are treated as "reject this outlink"
+      return null;
+    }
+
+    return toUrl;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java
new file mode 100644
index 0000000..6ad0ac8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This class represents a natural ordering for which parsing plugin should get
+ * called for a particular mimeType. It provides methods to store the
+ * parse-plugins.xml data, and methods to retrieve the name of the appropriate
+ * parsing plugin for a contentType.
+ * 
+ * @author mattmann
+ * @version 1.0
+ */
+class ParsePluginList {
+
+  /* a map to link mimeType to an ordered list of parsing plugins */
+  private Map<String, List<String>> fMimeTypeToPluginMap = null;
+
+  /* A list of aliases */
+  private Map<String, String> aliases = null;
+
+  /**
+   * Constructs a new ParsePluginList
+   */
+  ParsePluginList() {
+    fMimeTypeToPluginMap = new HashMap<String, List<String>>();
+    aliases = new HashMap<String, String>();
+  }
+
+  List<String> getPluginList(String mimeType) {
+    return fMimeTypeToPluginMap.get(mimeType);
+  }
+
+  void setAliases(Map<String, String> aliases) {
+    this.aliases = aliases;
+  }
+
+  Map<String, String> getAliases() {
+    return aliases;
+  }
+
+  void setPluginList(String mimeType, List<String> l) {
+    fMimeTypeToPluginMap.put(mimeType, l);
+  }
+
+  List<String> getSupportedMimeTypes() {
+    return Arrays
+        .asList(fMimeTypeToPluginMap.keySet().toArray(new String[] {}));
+  }
+
+}


[25/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/plugin.xml b/nutch-plugins/creativecommons/plugin.xml
new file mode 100755
index 0000000..de9cf36
--- /dev/null
+++ b/nutch-plugins/creativecommons/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="creativecommons"
+   name="Creative Commons Plugins"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="creativecommons.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.creativecommons.nutch.CCParseFilter"
+              name="Creative Commons Metadata Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="CCParseFilter"
+                      class="org.creativecommons.nutch.CCParseFilter"/>
+   </extension>
+
+   <extension id="org.creativecommons.nutch.CCIndexingFilter"
+              name="Creative Commons Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="CCIndexingFilter"
+                      class="org.creativecommons.nutch.CCIndexingFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/pom.xml b/nutch-plugins/creativecommons/pom.xml
new file mode 100644
index 0000000..7eb7564
--- /dev/null
+++ b/nutch-plugins/creativecommons/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>creativecommons</artifactId>
+    <packaging>jar</packaging>
+
+    <name>creativecommons</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java
new file mode 100644
index 0000000..e7c55c4
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.metadata.CreativeCommons;
+
+import org.apache.nutch.parse.Parse;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.*;
+import java.net.URL;
+import java.net.MalformedURLException;
+
+/** Adds basic searchable fields to a document. */
+public class CCIndexingFilter implements IndexingFilter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(CCIndexingFilter.class);
+
+  /** The name of the document field we use. */
+  public static String FIELD = "cc";
+
+  private Configuration conf;
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    Metadata metadata = parse.getData().getParseMeta();
+    // index the license
+    String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
+    if (licenseUrl != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
+      }
+
+      // add the entire license as cc:license=xxx
+      addFeature(doc, "license=" + licenseUrl);
+
+      // index license attributes extracted of the license url
+      addUrlFeatures(doc, licenseUrl);
+    }
+
+    // index the license location as cc:meta=xxx
+    String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
+    if (licenseLocation != null) {
+      addFeature(doc, "meta=" + licenseLocation);
+    }
+
+    // index the work type cc:type=xxx
+    String workType = metadata.get(CreativeCommons.WORK_TYPE);
+    if (workType != null) {
+      addFeature(doc, workType);
+    }
+
+    return doc;
+  }
+
+  /**
+   * Add the features represented by a license URL. Urls are of the form
+   * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
+   * license feature.
+   */
+  public void addUrlFeatures(NutchDocument doc, String urlString) {
+    try {
+      URL url = new URL(urlString);
+
+      // tokenize the path of the url, breaking at slashes and dashes
+      StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
+
+      if (names.hasMoreTokens())
+        names.nextToken(); // throw away "licenses"
+
+      // add a feature per component after "licenses"
+      while (names.hasMoreTokens()) {
+        String feature = names.nextToken();
+        addFeature(doc, feature);
+      }
+    } catch (MalformedURLException e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
+      }
+    }
+  }
+
+  private void addFeature(NutchDocument doc, String feature) {
+    doc.add(FIELD, feature);
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java
new file mode 100644
index 0000000..1fa951e
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java
@@ -0,0 +1,300 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.metadata.CreativeCommons;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.hadoop.conf.Configuration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.*;
+import java.io.*;
+import java.net.*;
+import javax.xml.parsers.*;
+import org.xml.sax.InputSource;
+import org.w3c.dom.*;
+
+/** Adds metadata identifying the Creative Commons license used, if any. */
+public class CCParseFilter implements HtmlParseFilter {
+  public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);
+
+  /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
+  public static class Walker {
+    private URL base; // base url of page
+    private String rdfLicense; // subject url found, if any
+    private URL relLicense; // license url found, if any
+    private URL anchorLicense; // anchor url found, if any
+    private String workType; // work type URI
+
+    private Walker(URL base) {
+      this.base = base;
+    }
+
+    /** Scan the document adding attributes to metadata. */
+    public static void walk(Node doc, URL base, Metadata metadata,
+        Configuration conf) throws ParseException {
+
+      // walk the DOM tree, scanning for license data
+      Walker walker = new Walker(base);
+      walker.walk(doc);
+
+      // interpret results of walk
+      String licenseUrl = null;
+      String licenseLocation = null;
+      if (walker.rdfLicense != null) { // 1st choice: subject in RDF
+        licenseLocation = "rdf";
+        licenseUrl = walker.rdfLicense;
+      } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license
+        licenseLocation = "rel";
+        licenseUrl = walker.relLicense.toString();
+      } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
+        licenseLocation = "a";
+        licenseUrl = walker.anchorLicense.toString();
+      } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
+        throw new ParseException("No CC license.  Excluding.");
+      }
+
+      // add license to metadata
+      if (licenseUrl != null) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("CC: found " + licenseUrl + " in " + licenseLocation
+              + " of " + base);
+        }
+        metadata.add(CreativeCommons.LICENSE_URL, licenseUrl);
+        metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation);
+      }
+
+      if (walker.workType != null) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("CC: found " + walker.workType + " in " + base);
+        }
+        metadata.add(CreativeCommons.WORK_TYPE, walker.workType);
+      }
+
+    }
+
+    /** Scan the document looking for RDF in comments and license elements. */
+    private void walk(Node node) {
+
+      // check element nodes for license URL
+      if (node instanceof Element) {
+        findLicenseUrl((Element) node);
+      }
+
+      // check comment nodes for license RDF
+      if (node instanceof Comment) {
+        findRdf(((Comment) node).getData());
+      }
+
+      // recursively walk child nodes
+      NodeList children = node.getChildNodes();
+      for (int i = 0; children != null && i < children.getLength(); i++) {
+        walk(children.item(i));
+      }
+    }
+
+    /**
+     * Extract license url from element, if any. These are the href attribute of
+     * anchor elements with rel="license". These must also point to
+     * http://creativecommons.org/licenses/.
+     */
+    private void findLicenseUrl(Element element) {
+      // only look in Anchor elements
+      if (!"a".equalsIgnoreCase(element.getTagName()))
+        return;
+
+      // require an href
+      String href = element.getAttribute("href");
+      if (href == null)
+        return;
+
+      try {
+        URL url = new URL(base, href); // resolve the url
+
+        // check that it's a CC license URL
+        if ("http".equalsIgnoreCase(url.getProtocol())
+            && "creativecommons.org".equalsIgnoreCase(url.getHost())
+            && url.getPath() != null && url.getPath().startsWith("/licenses/")
+            && url.getPath().length() > "/licenses/".length()) {
+
+          // check rel="license"
+          String rel = element.getAttribute("rel");
+          if (rel != null && "license".equals(rel) && this.relLicense == null) {
+            this.relLicense = url; // found rel license
+          } else if (this.anchorLicense == null) {
+            this.anchorLicense = url; // found anchor license
+          }
+        }
+      } catch (MalformedURLException e) { // ignore malformed urls
+      }
+    }
+
+    /** Configure a namespace aware XML parser. */
+    private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
+        .newInstance();
+    static {
+      FACTORY.setNamespaceAware(true);
+    }
+
+    /** Creative Commons' namespace URI. */
+    private static final String CC_NS = "http://web.resource.org/cc/";
+
+    /** Dublin Core namespace URI. */
+    private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
+
+    /** RDF syntax namespace URI. */
+    private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+    private void findRdf(String comment) {
+      // first check for likely RDF in comment
+      int rdfPosition = comment.indexOf("RDF");
+      if (rdfPosition < 0)
+        return; // no RDF, abort
+      int nsPosition = comment.indexOf(CC_NS);
+      if (nsPosition < 0)
+        return; // no RDF, abort
+
+      // try to parse the XML
+      Document doc;
+      try {
+        DocumentBuilder parser = FACTORY.newDocumentBuilder();
+        doc = parser.parse(new InputSource(new StringReader(comment)));
+      } catch (Exception e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
+        }
+        // e.printStackTrace();
+        return;
+      }
+
+      // check that root is rdf:RDF
+      NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
+      if (roots.getLength() != 1) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("CC: No RDF root in " + base);
+        }
+        return;
+      }
+      Element rdf = (Element) roots.item(0);
+
+      // get cc:License nodes inside rdf:RDF
+      NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
+      for (int i = 0; i < licenses.getLength(); i++) {
+
+        Element l = (Element) licenses.item(i);
+
+        // license is rdf:about= attribute from cc:License
+        this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
+
+        // walk predicates of cc:License
+        NodeList predicates = l.getChildNodes();
+        for (int j = 0; j < predicates.getLength(); j++) {
+          Node predicateNode = predicates.item(j);
+          if (!(predicateNode instanceof Element))
+            continue;
+          Element predicateElement = (Element) predicateNode;
+
+          // extract predicates of cc:xxx predicates
+          if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
+            continue;
+          }
+
+          // add object and predicate to metadata
+          // metadata.put(object, predicate);
+          // if (LOG.isInfoEnabled()) {
+          // LOG.info("CC: found: "+predicate+"="+object);
+          // }
+        }
+      }
+
+      // get cc:Work nodes from rdf:RDF
+      NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
+      for (int i = 0; i < works.getLength(); i++) {
+        // get dc:type nodes from cc:Work
+        NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
+
+        for (int j = 0; j < types.getLength(); j++) {
+          Element type = (Element) types.item(j);
+          String workUri = type.getAttributeNodeNS(RDF_NS, "resource")
+              .getValue();
+          this.workType = WORK_TYPE_NAMES.get(workUri);
+        }
+      }
+    }
+  }
+
+  private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();
+  static {
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
+        "interactive");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
+  }
+
+  private Configuration conf;
+
+  /**
+   * Adds metadata or otherwise modifies a parse of an HTML document, given the
+   * DOM tree of a page.
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // get parse obj
+    Parse parse = parseResult.get(content.getUrl());
+
+    // construct base url
+    URL base;
+    try {
+      base = new URL(content.getBaseUrl());
+    } catch (MalformedURLException e) {
+      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
+      parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
+          emptyParse.getData());
+      return parseResult;
+    }
+
+    try {
+      // extract license metadata
+      Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
+    } catch (ParseException e) {
+      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
+      parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
+          emptyParse.getData());
+      return parseResult;
+    }
+
+    return parseResult;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html
new file mode 100644
index 0000000..0c91293
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Sample plugins that parse and index Creative Commons metadata.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java b/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
new file mode 100755
index 0000000..5beb47d
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/test/java/org/creativecommons/nutch/TestCCParseFilter.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import static org.apache.nutch.test.TestUtils.getFile;
+
+import java.io.*;
+
+public class TestCCParseFilter {
+
+  @Test
+  public void testPages() throws Exception {
+
+    pageTest(getFile(this, "anchor.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+    // Tika returns <a> whereas parse-html returns <rel>
+    // check later
+    pageTest(getFile(this, "rel.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+    // Tika returns <a> whereas parse-html returns <rdf>
+    // check later
+    pageTest(getFile(this, "rdf.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+  }
+
+  public void pageTest(File file, String url, String license, String location,
+      String type) throws Exception {
+
+    String contentType = "text/html";
+    InputStream in = new FileInputStream(file);
+    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
+    byte[] buffer = new byte[1024];
+    int i;
+    while ((i = in.read(buffer)) != -1) {
+      out.write(buffer, 0, i);
+    }
+    in.close();
+    byte[] bytes = out.toByteArray();
+    Configuration conf = NutchConfiguration.create();
+
+    Content content = new Content(url, url, bytes, contentType, new Metadata(),
+        conf);
+    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
+    Metadata metadata = parse.getData().getParseMeta();
+    Assert.assertEquals(license, metadata.get("License-Url"));
+    Assert.assertEquals(location, metadata.get("License-Location"));
+    Assert.assertEquals(type, metadata.get("Work-Type"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/src/test/resources/anchor.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/resources/anchor.html b/nutch-plugins/creativecommons/src/test/resources/anchor.html
new file mode 100755
index 0000000..90b5227
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/test/resources/anchor.html
@@ -0,0 +1,9 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
+<html>
+<head>
+</head>
+<body>
+<p><a href="http://creativecommons.org/licenses/by-nc-sa/1.0"><img alt="Creative Commons License" src="http://creativecommons.org/images/public/somerights.gif" align="right"></a>This file is licensed under a
+<a href="http://creativecommons.org/licenses/by-nc-sa/1.0">Creative Commons License</a>.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/src/test/resources/rdf.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/resources/rdf.html b/nutch-plugins/creativecommons/src/test/resources/rdf.html
new file mode 100755
index 0000000..fb2c34d
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/test/resources/rdf.html
@@ -0,0 +1,35 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+ <head>
+ </head>
+ <body>
+
+<!-- Creative Commons License -->
+<p><a href="http://creativecommons.org/licenses/by-nc/1.0"><img alt="Creative Commons License" border="0" src="http://creativecommons.org/images/public/somerights.gif" /></a><br />
+This work is licensed under a
+<a href="http://creativecommons.org/licenses/by-nc/1.0">Creative Commons License</a>.
+<!--  end Creative Commons License -->
+
+  <!--
+<rdf:RDF xmlns="http://web.resource.org/cc/"
+    xmlns:dc="http://purl.org/dc/elements/1.1/"
+    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<Work rdf:about="http://boingboing.net">
+   <dc:type rdf:resource="http://purl.org/dc/dcmitype/Text" />
+   <license rdf:resource="http://creativecommons.org/licenses/by-nc/1.0" />
+</Work>
+
+<License rdf:about="http://creativecommons.org/licenses/by-nc/1.0">
+   <requires rdf:resource="http://web.resource.org/cc/Attribution" />
+   <permits rdf:resource="http://web.resource.org/cc/DerivativeWorks" />
+   <permits rdf:resource="http://web.resource.org/cc/Reproduction" />
+   <permits rdf:resource="http://web.resource.org/cc/Distribution" />
+   <prohibits rdf:resource="http://web.resource.org/cc/CommercialUse" />
+   <requires rdf:resource="http://web.resource.org/cc/Notice" />
+</License>
+
+</rdf:RDF>
+
+-->
+ </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/creativecommons/src/test/resources/rel.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/resources/rel.html b/nutch-plugins/creativecommons/src/test/resources/rel.html
new file mode 100755
index 0000000..413d52f
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/test/resources/rel.html
@@ -0,0 +1,6 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head>
+</head><body>
+<a rel="license" href="http://creativecommons.org/licenses/by-nc/2.0">CC by-nc</a> 
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/build.xml b/nutch-plugins/feed/build.xml
new file mode 100644
index 0000000..7fe7050
--- /dev/null
+++ b/nutch-plugins/feed/build.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0"?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+
+<project name="feed" default="jar-core">
+
+    <import file="../build-plugin.xml" />
+    
+    <!-- Build compilation dependencies -->
+    <target name="deps-jar">
+      <ant target="jar" inheritall="false" dir="../lib-xml"/>
+    </target>
+
+    <!-- Add compilation dependencies to classpath -->
+    <path id="plugin.deps">
+      <fileset dir="${nutch.root}/build">
+        <include name="**/lib-xml/*.jar" />
+      </fileset>
+    </path>
+
+    <!-- Deploy Unit test dependencies -->
+    <target name="deps-test">
+      <ant target="deploy" inheritall="false"
+           dir="../nutch-extensionpoints" />
+      <ant target="deploy" inheritall="false" dir="../protocol-file" />
+    </target>
+    
+    <!-- for junit test -->
+    <mkdir dir="${build.test}/data" />
+    <copy file="sample/rsstest.rss" todir="${build.test}/data" />
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/ivy.xml b/nutch-plugins/feed/ivy.xml
new file mode 100644
index 0000000..c29bd03
--- /dev/null
+++ b/nutch-plugins/feed/ivy.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
<ivy-module version="1.0">
  <info organisation="org.apache.nutch" module="${ant.project.name}">
    <license name="Apache 2.0"/>
    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
    <description>
        Apache Nutch
    </description>
  </info>

  <configurations>
    <!-- Shared configuration definitions. The doubled slash ("..//ivy") in the
         original resolved identically but was a typo; normalized here.
         NOTE(review): "../../.." climbs three levels from nutch-plugins/feed —
         verify this still points at the repository's ivy/ directory after the
         Maven restructuring. -->
    <include file="../../../ivy/ivy-configurations.xml"/>
  </configurations>

  <publications>
    <!-- Get the artifact from our module name. -->
    <artifact conf="master"/>
  </publications>

  <dependencies>
    <!-- Keep versions in sync with plugin.xml (rome-0.9.jar, jdom-1.1.jar)
         and the Maven pom.xml of this plugin. -->
    <dependency org="rome" name="rome" rev="0.9" conf="*->master"/>
    <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->master"/>
  </dependencies>

</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/plugin.xml b/nutch-plugins/feed/plugin.xml
new file mode 100644
index 0000000..3a68d8d
--- /dev/null
+++ b/nutch-plugins/feed/plugin.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0"
	 provider-name="nutch.org">
    <runtime>
      <library name="feed.jar">
        <export name="*" />
      </library>
      <!-- Bundled third-party jars; versions must match ivy.xml. -->
      <library name="rome-0.9.jar" />
      <library name="jdom-1.1.jar" />
    </runtime>
    
    <requires>
      <import plugin="nutch-extensionpoints" />
      <import plugin="lib-xml" />
    </requires>
    
    <!-- Registers FeedParser for RSS/Atom content types and for URLs
         ending in the "rss" path suffix. -->
    <extension id="org.apache.nutch.parse.feed" name="Feed Parser"
      point="org.apache.nutch.parse.Parser">
      
      <implementation id="org.apache.nutch.parse.feed.FeedParser"
         class="org.apache.nutch.parse.feed.FeedParser">
         <parameter name="contentType" value="application/rss+xml" />
         <parameter name="contentType" value="application/atom+xml" />
         <parameter name="contentType" value="text/xml" />
         <parameter name="pathSuffix" value="rss" />
     </implementation>
    </extension>
    <!-- Indexing filter that copies feed metadata fields into the index. -->
    <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer"
       point="org.apache.nutch.indexer.IndexingFilter">
     <implementation id="FeedIndexingFilter"
       class="org.apache.nutch.indexer.feed.FeedIndexingFilter" />
    </extension>
</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/pom.xml b/nutch-plugins/feed/pom.xml
new file mode 100644
index 0000000..d94c0b6
--- /dev/null
+++ b/nutch-plugins/feed/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.apache.nutch</groupId>
        <artifactId>nutch-plugins</artifactId>
        <version>1.13-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
    <artifactId>feed</artifactId>
    <packaging>jar</packaging>

    <name>feed</name>
    <url>http://nutch.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <!-- Dependencies must stay in sync with ivy.xml and with the jars bundled
         by plugin.xml (rome-0.9.jar, jdom-1.1.jar). The original POM declared
         rome 1.0 while ivy.xml/plugin.xml use 0.9, and omitted jdom entirely;
         both are aligned here. -->
    <dependencies>
        <dependency>
            <groupId>rome</groupId>
            <artifactId>rome</artifactId>
            <version>0.9</version>
        </dependency>
        <dependency>
            <groupId>org.jdom</groupId>
            <artifactId>jdom</artifactId>
            <version>1.1</version>
        </dependency>
    </dependencies>
</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
new file mode 100644
index 0000000..94b440a
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
@@ -0,0 +1,129 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.feed;
+
+//JDK imports
+import java.util.Date;
+
+//APACHE imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Feed;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+
+/**
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ * 
+ *        An {@link IndexingFilter} implementation to pull out the relevant
+ *        extracted {@link Metadata} fields from the RSS feeds and into the
+ *        index.
+ * 
+ */
+public class FeedIndexingFilter implements IndexingFilter {
+
+  public static final String dateFormatStr = "yyyyMMddHHmm";
+
+  private Configuration conf;
+
+  private final static String PUBLISHED_DATE = "publishedDate";
+
+  private final static String UPDATED_DATE = "updatedDate";
+
+  /**
+   * Extracts out the relevant fields:
+   * 
+   * <ul>
+   * <li>FEED_AUTHOR</li>
+   * <li>FEED_TAGS</li>
+   * <li>FEED_PUBLISHED</li>
+   * <li>FEED_UPDATED</li>
+   * <li>FEED</li>
+   * </ul>
+   * 
+   * And sends them to the {@link Indexer} for indexing within the Nutch index.
+   * 
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    ParseData parseData = parse.getData();
+    Metadata parseMeta = parseData.getParseMeta();
+
+    String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR);
+    String[] tags = parseMeta.getValues(Feed.FEED_TAGS);
+    String published = parseMeta.get(Feed.FEED_PUBLISHED);
+    String updated = parseMeta.get(Feed.FEED_UPDATED);
+    String feed = parseMeta.get(Feed.FEED);
+
+    if (authors != null) {
+      for (String author : authors) {
+        doc.add(Feed.FEED_AUTHOR, author);
+      }
+    }
+
+    if (tags != null) {
+      for (String tag : tags) {
+        doc.add(Feed.FEED_TAGS, tag);
+      }
+    }
+
+    if (feed != null)
+      doc.add(Feed.FEED, feed);
+
+    if (published != null) {
+      Date date = new Date(Long.parseLong(published));
+      doc.add(PUBLISHED_DATE, date);
+    }
+
+    if (updated != null) {
+      Date date = new Date(Long.parseLong(updated));
+      doc.add(UPDATED_DATE, date);
+    }
+
+    return doc;
+  }
+
+  /**
+   * @return the {@link Configuration} object used to configure this
+   *         {@link IndexingFilter}.
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Sets the {@link Configuration} object used to configure this
+   * {@link IndexingFilter}.
+   * 
+   * @param conf
+   *          The {@link Configuration} object used to configure this
+   *          {@link IndexingFilter}.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
new file mode 100644
index 0000000..8f52628
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to index meta data from RSS feeds.
+ */
+package org.apache.nutch.indexer.feed;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java
new file mode 100644
index 0000000..936c885
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java
@@ -0,0 +1,374 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.feed;
+
// JDK imports
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;

// APACHE imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
// import org.apache.nutch.indexer.anchor.AnchorIndexingFilter; removed as per NUTCH-1078
import org.apache.nutch.metadata.Feed;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
import org.xml.sax.InputSource;

// ROME imports
import com.sun.syndication.feed.synd.SyndCategory;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.feed.synd.SyndPerson;
import com.sun.syndication.io.SyndFeedInput;
+
+/**
+ * 
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ * 
+ *        <p>
+ *        A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced
+ *        links and content present in the feed.
+ *        </p>
+ * 
+ */
+public class FeedParser implements Parser {
+
+  public static final String CHARSET_UTF8 = "charset=UTF-8";
+
+  public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
+      + CHARSET_UTF8;
+
+  public static final Logger LOG = LoggerFactory.getLogger(FeedParser.class);
+
+  private Configuration conf;
+
+  private ParserFactory parserFactory;
+
+  private URLNormalizers normalizers;
+
+  private URLFilters filters;
+
+  private String defaultEncoding;
+
+  /**
+   * Parses the given feed and extracts out and parsers all linked items within
+   * the feed, using the underlying ROME feed parsing library.
+   * 
+   * @param content
+   *          A {@link Content} object representing the feed that is being
+   *          parsed by this {@link Parser}.
+   * 
+   * @return A {@link ParseResult} containing all {@link Parse}d feeds that were
+   *         present in the feed file that this {@link Parser} dealt with.
+   * 
+   */
+  public ParseResult getParse(Content content) {
+    SyndFeed feed = null;
+    ParseResult parseResult = new ParseResult(content.getUrl());
+
+    EncodingDetector detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    String encoding = detector.guessEncoding(content, defaultEncoding);
+    try {
+      InputSource input = new InputSource(new ByteArrayInputStream(
+          content.getContent()));
+      input.setEncoding(encoding);
+      SyndFeedInput feedInput = new SyndFeedInput();
+      feed = feedInput.build(input);
+    } catch (Exception e) {
+      // return empty parse
+      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
+          + StringUtils.stringifyException(e));
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    String feedLink = feed.getLink();
+    try {
+      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
+      if (feedLink != null)
+        feedLink = filters.filter(feedLink);
+    } catch (Exception e) {
+      feedLink = null;
+    }
+
+    List<?> entries = feed.getEntries();
+    for (Object entry : entries) {
+      addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
+    }
+
+    String feedDesc = stripTags(feed.getDescriptionEx());
+    String feedTitle = stripTags(feed.getTitleEx());
+
+    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
+        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
+        content.getMetadata()));
+
+    return parseResult;
+  }
+
+  /**
+   * 
+   * Sets the {@link Configuration} object for this {@link Parser}. This
+   * {@link Parser} expects the following configuration properties to be set:
+   * 
+   * <ul>
+   * <li>URLNormalizers - properties in the configuration object to set up the
+   * default url normalizers.</li>
+   * <li>URLFilters - properties in the configuration object to set up the
+   * default url filters.</li>
+   * </ul>
+   * 
+   * @param conf
+   *          The Hadoop {@link Configuration} object to use to configure this
+   *          {@link Parser}.
+   * 
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.parserFactory = new ParserFactory(conf);
+    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
+    this.filters = new URLFilters(conf);
+    this.defaultEncoding = conf.get("parser.character.encoding.default",
+        "windows-1252");
+  }
+
+  /**
+   * 
+   * @return The {@link Configuration} object used to configure this
+   *         {@link Parser}.
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Runs a command line version of this {@link Parser}.
+   * 
+   * @param args
+   *          A single argument (expected at arg[0]) representing a path on the
+   *          local filesystem that points to a feed file.
+   * 
+   * @throws Exception
+   *           If any error occurs.
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length != 1) {
+      System.err.println("Usage: FeedParser <feed>");
+      System.exit(1);
+    }
+    String name = args[0];
+    String url = "file:" + name;
+    Configuration conf = NutchConfiguration.create();
+    FeedParser parser = new FeedParser();
+    parser.setConf(conf);
+    File file = new File(name);
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    in.readFully(bytes);
+    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+        "application/rss+xml", new Metadata(), conf));
+    for (Entry<Text, Parse> entry : parseResult) {
+      System.out.println("key: " + entry.getKey());
+      Parse parse = entry.getValue();
+      System.out.println("data: " + parse.getData());
+      System.out.println("text: " + parse.getText() + "\n");
+    }
+  }
+
+  private void addToMap(ParseResult parseResult, SyndFeed feed,
+      String feedLink, SyndEntry entry, Content content) {
+    String link = entry.getLink(), text = null, title = null;
+    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
+    Parse parse = null;
+    SyndContent description = entry.getDescription();
+
+    try {
+      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
+
+      if (link != null)
+        link = filters.filter(link);
+    } catch (Exception e) {
+      e.printStackTrace();
+      return;
+    }
+
+    if (link == null)
+      return;
+
+    title = stripTags(entry.getTitleEx());
+
+    if (feedLink != null)
+      parseMeta.set("feed", feedLink);
+
+    addFields(parseMeta, contentMeta, feed, entry);
+
+    // some item descriptions contain markup text in them,
+    // so we temporarily set their content-type to parse them
+    // with another plugin
+    String contentType = contentMeta.get(Response.CONTENT_TYPE);
+
+    if (description != null)
+      text = description.getValue();
+
+    if (text == null) {
+      List<?> contents = entry.getContents();
+      StringBuilder buf = new StringBuilder();
+      for (Object syndContent : contents) {
+        buf.append(((SyndContent) syndContent).getValue());
+      }
+      text = buf.toString();
+    }
+
+    try {
+      Parser parser = parserFactory.getParsers(contentType, link)[0];
+      parse = parser.getParse(
+          new Content(link, link, text.getBytes(), contentType, contentMeta,
+              conf)).get(link);
+    } catch (ParserNotFound e) { /* ignore */
+    }
+
+    if (parse != null) {
+      ParseData data = parse.getData();
+      data.getContentMeta().remove(Response.CONTENT_TYPE);
+      mergeMetadata(data.getParseMeta(), parseMeta);
+      parseResult.put(link, new ParseText(parse.getText()),
+          new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
+              data.getContentMeta(), data.getParseMeta()));
+    } else {
+      contentMeta.remove(Response.CONTENT_TYPE);
+      parseResult.put(link, new ParseText(text), new ParseData(
+          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
+          parseMeta));
+    }
+
+  }
+
+  private static String stripTags(SyndContent c) {
+    if (c == null)
+      return "";
+
+    String value = c.getValue();
+
+    String[] parts = value.split("<[^>]*>");
+    StringBuffer buf = new StringBuffer();
+
+    for (String part : parts)
+      buf.append(part);
+
+    return buf.toString().trim();
+  }
+
+  private void addFields(Metadata parseMeta, Metadata contentMeta,
+      SyndFeed feed, SyndEntry entry) {
+    List<?> authors = entry.getAuthors(), categories = entry.getCategories();
+    Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate();
+    String contentType = null;
+
+    if (authors != null) {
+      for (Object o : authors) {
+        SyndPerson author = (SyndPerson) o;
+        String authorName = author.getName();
+        if (checkString(authorName)) {
+          parseMeta.add(Feed.FEED_AUTHOR, authorName);
+        }
+      }
+    } else {
+      // getAuthors may return null if feed is non-atom
+      // if so, call getAuthor to get Dublin Core module creator.
+      String authorName = entry.getAuthor();
+      if (checkString(authorName)) {
+        parseMeta.set(Feed.FEED_AUTHOR, authorName);
+      }
+    }
+
+    for (Object i : categories) {
+      parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName());
+    }
+
+    if (published != null) {
+      parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
+    }
+    if (updated != null) {
+      parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
+    }
+
+    SyndContent description = entry.getDescription();
+    if (description != null) {
+      contentType = description.getType();
+    } else {
+      // TODO: What to do if contents.size() > 1?
+      List<?> contents = entry.getContents();
+      if (contents.size() > 0) {
+        contentType = ((SyndContent) contents.get(0)).getType();
+      }
+    }
+
+    if (checkString(contentType)) {
+      // ROME may return content-type as html
+      if (contentType.equals("html"))
+        contentType = "text/html";
+      else if (contentType.equals("xhtml"))
+        contentType = "text/xhtml";
+      contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
+    } else {
+      contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
+    }
+
+  }
+
+  private void mergeMetadata(Metadata first, Metadata second) {
+    for (String name : second.names()) {
+      String[] values = second.getValues(name);
+      for (String value : values) {
+        first.add(name, value);
+      }
+    }
+  }
+
+  private boolean checkString(String s) {
+    return s != null && !s.equals("");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
new file mode 100644
index 0000000..3b15968
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse RSS feeds.
+ */
+package org.apache.nutch.parse.feed;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
new file mode 100644
index 0000000..36c8739
--- /dev/null
+++ b/nutch-plugins/feed/src/test/java/org/apache/nutch/parse/feed/TestFeedParser.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.Map;
+
+import org.junit.Assert;
+import org.junit.Test;
+// APACHE imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.util.NutchConfiguration;
+
/**
 * 
 * @author mattmann
 * 
 *         Test Suite for the {@link FeedParser}.
 * 
 */
public class TestFeedParser {

  private String fileSeparator = System.getProperty("file.separator");

  // This system property is defined in ./src/plugin/build-plugin.xml
  private String sampleDir = System.getProperty("test.data", ".");

  // Make sure sample files are copied to "test.data" as specified in
  // ./src/plugin/feed/build.xml during plugin compilation.
  // NOTE(review): in this restructuring the fixture lives in
  // src/test/resources/rsstest.rss — confirm the staging still happens.

  private String[] sampleFiles = { "rsstest.rss" };

  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
      .getName());

  /**
   * Calls the {@link FeedParser} on a sample RSS file and checks that there are
   * 3 {@link ParseResult} entries including the below 2 links:
   * <ul>
   * <li>http://www-scf.usc.edu/~mattmann/</li>
   * <li>http://www.nutch.org</li>
   * </ul>
   * 
   * 
   * @throws ProtocolNotFound
   *           If the {@link Protocol}Layer cannot be loaded (required to fetch
   *           the {@link Content} for the RSS file).
   * @throws ParseException
   *           If the {@link Parser}Layer cannot be loaded.
   */
  @Test
  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      // Build a file: URL for the sample feed; normalize Windows separators.
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
      urlString = urlString.replace('\\', '/');

      // Fetch the raw feed bytes through the protocol layer.
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();

      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);

      // Two items plus the feed page itself.
      Assert.assertEquals(3, parseResult.size());

      boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;

      for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
          .hasNext();) {
        Map.Entry<Text, Parse> entry = j.next();
        if (entry.getKey().toString()
            .equals("http://www-scf.usc.edu/~mattmann/")) {
          hasLink1 = true;
        } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
          hasLink2 = true;
        } else if (entry.getKey().toString().equals(urlString)) {
          hasLink3 = true;
        }

        // Every entry must carry a non-null Parse with ParseData.
        Assert.assertNotNull(entry.getValue());
        Assert.assertNotNull(entry.getValue().getData());
      }

      if (!hasLink1 || !hasLink2 || !hasLink3) {
        Assert.fail("Outlinks read from sample rss file are not correct!");
      }
    }

  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/feed/src/test/resources/rsstest.rss
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/test/resources/rsstest.rss b/nutch-plugins/feed/src/test/resources/rsstest.rss
new file mode 100644
index 0000000..758f6a1
--- /dev/null
+++ b/nutch-plugins/feed/src/test/resources/rsstest.rss
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+<rss version="0.91">
+    <channel>
+      <title>TestChannel</title>
+      <link>http://test.channel.com/</link> 
+      <description>Sample RSS File for Junit test</description> 
+      <language>en-us</language>
+      
+      <item>
+        <title>Home Page of Chris Mattmann</title>
+        <link>http://www-scf.usc.edu/~mattmann/</link>
+        <description>Chris Mattmann's home page</description>
+      </item>
+      <item>
+        <title>Awesome Open Source Search Engine</title> 
+        <link>http://www.nutch.org/</link> 
+        <description>Yup, that's what it is</description> 
+      </item>
+   </channel>
+</rss>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/headings/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/build.xml b/nutch-plugins/headings/build.xml
new file mode 100644
index 0000000..d334ad1
--- /dev/null
+++ b/nutch-plugins/headings/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="headings" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/headings/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/ivy.xml b/nutch-plugins/headings/ivy.xml
new file mode 100644
index 0000000..5b8393b
--- /dev/null
+++ b/nutch-plugins/headings/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+      <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/headings/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/plugin.xml b/nutch-plugins/headings/plugin.xml
new file mode 100644
index 0000000..0d7921a
--- /dev/null
+++ b/nutch-plugins/headings/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="headings"
+   name="Headings Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="headings.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.headings"
+              name="Nutch Headings Parse Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+
+      <implementation id="HeadingsParseFilter"
+                      class="org.apache.nutch.parse.headings.HeadingsParseFilter">
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/headings/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/pom.xml b/nutch-plugins/headings/pom.xml
new file mode 100644
index 0000000..219eb71
--- /dev/null
+++ b/nutch-plugins/headings/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>headings</artifactId>
+    <packaging>jar</packaging>
+
+    <name>headings</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
new file mode 100644
index 0000000..657f260
--- /dev/null
+++ b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.headings;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NodeWalker;
+import org.w3c.dom.*;
+
+/**
+ * HtmlParseFilter to retrieve h1 and h2 values from the DOM.
+ */
+public class HeadingsParseFilter implements HtmlParseFilter {
+
+  /**
+   * Pattern used to strip surplus whitespace; compiled once and shared.
+   */
+  protected static final Pattern whitespacePattern = Pattern.compile("\\s+");
+
+  private Configuration conf;
+  private String[] headings;           // element names to extract ("headings" property)
+  private boolean multiValued = false; // index every occurrence, not just the first
+
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    Parse parse = parseResult.get(content.getUrl());
+
+    for (int i = 0; headings != null && i < headings.length; i++) {
+      List<String> discoveredHeadings = getElement(doc, headings[i]);
+
+      if (discoveredHeadings.size() > 0) {
+        for (String heading : discoveredHeadings) {
+          if (heading != null) {
+            heading = heading.trim(); // fix: String.trim() returns a new string; result was discarded
+
+            if (heading.length() > 0) {
+              parse.getData().getParseMeta().add(headings[i], heading);
+            }
+          }
+        }
+      }
+    }
+
+    return parseResult;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    headings = conf.getStrings("headings");
+    multiValued = conf.getBoolean("headings.multivalued", false);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Returns the text of each element with the given tag name (first match only unless multi-valued).
+   */
+  protected List<String> getElement(DocumentFragment doc, String element) {
+    List<String> headings = new ArrayList<String>();
+    NodeWalker walker = new NodeWalker(doc);
+
+    while (walker.hasNext()) {
+      Node currentNode = walker.nextNode();
+
+      if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
+        if (element.equalsIgnoreCase(currentNode.getNodeName())) {
+          headings.add(getNodeValue(currentNode));
+
+          // Check for multiValued here, if disabled we don't need
+          // to discover more headings.
+          if (!multiValued) {
+            break;
+          }
+        }
+      }
+    }
+
+    return headings;
+  }
+
+  /**
+   * Concatenates the node's immediate child text nodes and collapses whitespace.
+   */
+  protected static String getNodeValue(Node node) {
+    StringBuilder buffer = new StringBuilder();
+
+    NodeList children = node.getChildNodes();
+
+    for (int i = 0; i < children.getLength(); i++) {
+      if (children.item(i).getNodeType() == Node.TEXT_NODE) {
+        buffer.append(children.item(i).getNodeValue());
+      }
+    }
+
+    // Return with stripped surplus whitespace
+    Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
+    return matcher.replaceAll(" ").trim();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
new file mode 100644
index 0000000..363e0b2
--- /dev/null
+++ b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree.
+ */
+package org.apache.nutch.parse.headings;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-anchor/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/build.xml b/nutch-plugins/index-anchor/build.xml
new file mode 100644
index 0000000..597b532
--- /dev/null
+++ b/nutch-plugins/index-anchor/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-anchor" default="jar-core">
+
+  <import file="../build-plugin.xml" />
+
+</project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-anchor/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/ivy.xml b/nutch-plugins/index-anchor/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/index-anchor/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-anchor/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/plugin.xml b/nutch-plugins/index-anchor/plugin.xml
new file mode 100644
index 0000000..208594b
--- /dev/null
+++ b/nutch-plugins/index-anchor/plugin.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<plugin id="index-anchor" name="Anchor Indexing Filter" version="1.0.0"
+  provider-name="nutch.org">
+
+  <runtime>
+    <library name="index-anchor.jar">
+      <export name="*" />
+    </library>
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints" />
+  </requires>
+
+  <extension id="org.apache.nutch.indexer.anchor"
+    name="Nutch Anchor Indexing Filter"
+    point="org.apache.nutch.indexer.IndexingFilter">
+    <implementation id="AnchorIndexingFilter"
+      class="org.apache.nutch.indexer.anchor.AnchorIndexingFilter" />
+  </extension>
+
+</plugin>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-anchor/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/pom.xml b/nutch-plugins/index-anchor/pom.xml
new file mode 100644
index 0000000..df01a61
--- /dev/null
+++ b/nutch-plugins/index-anchor/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-anchor</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-anchor</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
new file mode 100644
index 0000000..6c9b834
--- /dev/null
+++ b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.anchor;
+
+import java.util.HashSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Indexing filter that either indexes all inbound anchor text for a document
+ * or deduplicates anchors (case-insensitively) before adding them. Controlled
+ * by the boolean property {@code anchorIndexingFilter.deduplicate}; see
+ * nutch-default.xml for its default value.
+ */
+public class AnchorIndexingFilter implements IndexingFilter {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(AnchorIndexingFilter.class);
+  private Configuration conf;
+  private boolean deduplicate = false; // read from anchorIndexingFilter.deduplicate
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
+    LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Adds each inbound anchor text to the document's "anchor" field, optionally
+   * skipping duplicates (compared case-insensitively). See
+   * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} containing anchor text
+   * @return filtered NutchDocument
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]);
+
+    HashSet<String> set = null; // created lazily, only when deduplicating
+
+    for (int i = 0; i < anchors.length; i++) {
+      if (deduplicate) {
+        if (set == null)
+          set = new HashSet<String>();
+        String lcAnchor = anchors[i].toLowerCase(); // NOTE(review): default-locale lowercasing; consider Locale.ROOT — confirm
+
+        // Check if already processed the current anchor
+        if (!set.contains(lcAnchor)) {
+          doc.add("anchor", anchors[i]);
+
+          // Remember it so later duplicates are skipped
+          set.add(lcAnchor);
+        }
+      } else {
+        doc.add("anchor", anchors[i]);
+      }
+    }
+
+    return doc;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html
new file mode 100644
index 0000000..c255029
--- /dev/null
+++ b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>An indexing plugin for inbound anchor text.</p><p></p>
+</body>
+</html>


[11/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java
new file mode 100644
index 0000000..5c4c990
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -0,0 +1,402 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
+import org.apache.tika.sax.Link;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ * 
+ * This class holds a few utility methods for pulling content out of DOM nodes,
+ * such as getOutlinks, getText, etc.
+ * 
+ */
+public class DOMContentUtils {
+
+  private static class LinkParams {
+    private String elName;
+    private String attrName;
+    private int childLen;
+
+    private LinkParams(String elName, String attrName, int childLen) {
+      this.elName = elName;
+      this.attrName = attrName;
+      this.childLen = childLen;
+    }
+
+    public String toString() {
+      return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+    }
+  }
+
+  private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
+  private HashSet<String> ignoredTags = new HashSet<String>();
+  private Configuration conf;
+
+  public DOMContentUtils(Configuration conf) {
+    setConf(conf);
+  }
+
+  public void setConf(Configuration conf) {
+    // forceTags overrides the configurable tag ignoring applied further below
+    Collection<String> forceTags = new ArrayList<String>(1);
+
+    this.conf = conf;
+    linkParams.clear();
+    linkParams.put("a", new LinkParams("a", "href", 1));
+    linkParams.put("area", new LinkParams("area", "href", 0));
+    if (conf.getBoolean("parser.html.form.use_action", true)) {
+      linkParams.put("form", new LinkParams("form", "action", 1));
+      if (conf.get("parser.html.form.use_action") != null)
+        forceTags.add("form");
+    }
+    linkParams.put("frame", new LinkParams("frame", "src", 0));
+    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+    linkParams.put("script", new LinkParams("script", "src", 0));
+    linkParams.put("link", new LinkParams("link", "href", 0));
+    linkParams.put("img", new LinkParams("img", "src", 0));
+
+    // remove unwanted link tags from the linkParams map
+    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
+    for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
+      ignoredTags.add(ignoreTags[i].toLowerCase());
+      if (!forceTags.contains(ignoreTags[i]))
+        linkParams.remove(ignoreTags[i]);
+    }
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * append all the content text found beneath the DOM node to the
+   * <code>StringBuffer</code>.
+   * 
+   * <p>
+   * 
+   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
+   * and the <code>StringBuffer</code> will not contain any text encountered
+   * after a nested anchor is found.
+   * 
+   * <p>
+   * 
+   * @return true if nested anchors were found
+   */
+  private boolean getText(StringBuffer sb, Node node,
+      boolean abortOnNestedAnchors) {
+    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * This is a convenience method, equivalent to
+   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+   * 
+   */
+  public void getText(StringBuffer sb, Node node) {
+    getText(sb, node, false);
+  }
+
+  // returns true if abortOnNestedAnchors is true and we find nested
+  // anchors
+  private boolean getTextHelper(StringBuffer sb, Node node,
+      boolean abortOnNestedAnchors, int anchorDepth) {
+    boolean abort = false;
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("script".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if ("style".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
+        anchorDepth++;
+        if (anchorDepth > 1) {
+          abort = true;
+          break;
+        }
+      }
+      if (nodeType == Node.COMMENT_NODE) {
+        walker.skipChildren();
+      }
+      if (nodeType == Node.TEXT_NODE) {
+        // collapse runs of whitespace and trim the text value
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        text = text.trim();
+        if (text.length() > 0) {
+          if (sb.length() > 0)
+            sb.append(' ');
+          sb.append(text);
+        }
+      }
+    }
+
+    return abort;
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * append the content text found beneath the first <code>title</code> node to
+   * the <code>StringBuffer</code>.
+   * 
+   * @return true if a title node was found, false otherwise
+   */
+  public boolean getTitle(StringBuffer sb, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+        return false;
+      }
+
+      if (nodeType == Node.ELEMENT_NODE) {
+        if ("title".equalsIgnoreCase(nodeName)) {
+          getText(sb, currentNode);
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  /** If Node contains a BASE tag then its HREF is returned. */
+  URL getBase(Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      // is this node a BASE tag?
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+          return null;
+        }
+
+        if ("base".equalsIgnoreCase(nodeName)) {
+          NamedNodeMap attrs = currentNode.getAttributes();
+          for (int i = 0; i < attrs.getLength(); i++) {
+            Node attr = attrs.item(i);
+            if ("href".equalsIgnoreCase(attr.getNodeName())) {
+              try {
+                return new URL(attr.getNodeValue());
+              } catch (MalformedURLException e) {
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // no BASE tag found under this node
+    return null;
+  }
+
+  private boolean hasOnlyWhiteSpace(Node node) {
+    String val = node.getNodeValue();
+    for (int i = 0; i < val.length(); i++) {
+      if (!Character.isWhitespace(val.charAt(i)))
+        return false;
+    }
+    return true;
+  }
+  
+  // this only covers a few cases of empty links that are symptomatic
+  // of nekohtml's DOM-fixup process...
+  private boolean shouldThrowAwayLink(Node node, NodeList children,
+      int childLen, LinkParams params) {
+    if (childLen == 0) {
+      // this has no inner structure
+      if (params.childLen == 0)
+        return false;
+      else
+        return true;
+    } else if ((childLen == 1)
+        && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+        && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
+      // single nested link
+      return true;
+
+    } else if (childLen == 2) {
+
+      Node c0 = children.item(0);
+      Node c1 = children.item(1);
+
+      if ((c0.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+          && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
+        // single link followed by whitespace node
+        return true;
+      }
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
+        // whitespace node followed by single link
+        return true;
+      }
+
+    } else if (childLen == 3) {
+      Node c0 = children.item(0);
+      Node c1 = children.item(1);
+      Node c2 = children.item(2);
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE)
+          && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
+          && hasOnlyWhiteSpace(c2)) {
+        // single link surrounded by whitespace nodes
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * This method finds all anchors below the supplied DOM <code>node</code>, and
+   * creates appropriate {@link Outlink} records for each (relative to the
+   * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
+   * {@link ArrayList}.
+   * 
+   * <p>
+   * 
+   * Links without inner structure (tags, text, etc) are discarded, as are links
+   * which contain only single nested links and empty text nodes (this is a
+   * common DOM-fixup artifact, at least with nekohtml).
+   */
+  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+      NodeList children = currentNode.getChildNodes();
+      int childLen = (children != null) ? children.getLength() : 0;
+
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        nodeName = nodeName.toLowerCase();
+        LinkParams params = (LinkParams) linkParams.get(nodeName);
+        if (params != null) {
+          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
+
+            StringBuffer linkText = new StringBuffer();
+            getText(linkText, currentNode, true);
+
+            NamedNodeMap attrs = currentNode.getAttributes();
+            String target = null;
+            boolean noFollow = false;
+            boolean post = false;
+            for (int i = 0; i < attrs.getLength(); i++) {
+              Node attr = attrs.item(i);
+              String attrName = attr.getNodeName();
+              if (params.attrName.equalsIgnoreCase(attrName)) {
+                target = attr.getNodeValue();
+              } else if ("rel".equalsIgnoreCase(attrName)
+                  && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+                noFollow = true;
+              } else if ("method".equalsIgnoreCase(attrName)
+                  && "post".equalsIgnoreCase(attr.getNodeValue())) {
+                post = true;
+              }
+            }
+            if (target != null && !noFollow && !post)
+              try {
+
+                URL url = URLUtil.resolveURL(base, target);
+                outlinks.add(new Outlink(url.toString(), linkText.toString()
+                    .trim()));
+              } catch (MalformedURLException e) {
+                // malformed target URL — skip this outlink
+              }
+          }
+          // this should not have any children, skip them
+          if (params.childLen == 0)
+            continue;
+        }
+      }
+    }
+  }
+  
+  // Outlink extraction from Tika's own Link records (NUTCH-1918).
+  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, List<Link> tikaExtractedOutlinks) {
+    String target = null;
+    String anchor = null;
+    boolean noFollow = false;
+
+    for (Link link : tikaExtractedOutlinks) {
+      target = link.getUri();
+      noFollow = (link.getRel().toLowerCase().equals("nofollow")) ? true : false;
+      anchor = link.getText();
+
+      if (!ignoredTags.contains(link.getType())) {
+        if (target != null && !noFollow) {
+          try {
+            URL url = URLUtil.resolveURL(base, target);
+            
+            // clean the anchor
+            anchor = anchor.replaceAll("\\s+", " ");
+            anchor = anchor.trim();
+            
+            outlinks.add(new Outlink(url.toString(), anchor));
+          } catch (MalformedURLException e) {
+            // malformed target URL — skip this outlink
+          }
+        }
+      }
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
new file mode 100644
index 0000000..294bde9
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.net.URL;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees. This class handles
+ * specifically Robots META directives (all, none, nofollow, noindex), finding
+ * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
+ * stored in a HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+  /**
+   * Handles the robots META directives ("all", "none", "noindex", "nofollow",
+   * "noarchive"), BASE HREF tags, and HTTP-EQUIV no-cache/refresh directives.
+   */
+
+  /**
+   * Sets the indicators in <code>robotsMeta</code> to appropriate values, based
+   * on any META tags found under the given <code>node</code>.
+   */
+  public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) {
+        // META tags should not be under body
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node nameNode = null;
+        Node equivNode = null;
+        Node contentNode = null;
+        // Retrieves name, http-equiv and content attributes
+        for (int i = 0; i < attrs.getLength(); i++) {
+          Node attr = attrs.item(i);
+          String attrName = attr.getNodeName().toLowerCase();
+          if (attrName.equals("name")) {
+            nameNode = attr;
+          } else if (attrName.equals("http-equiv")) {
+            equivNode = attr;
+          } else if (attrName.equals("content")) {
+            contentNode = attr;
+          }
+        }
+
+        if (nameNode != null) {
+          if (contentNode != null) {
+            String name = nameNode.getNodeValue().toLowerCase();
+            metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
+            if ("robots".equals(name)) {
+
+              if (contentNode != null) {
+                String directives = contentNode.getNodeValue().toLowerCase();
+                int index = directives.indexOf("none");
+
+                if (index >= 0) {
+                  metaTags.setNoIndex();
+                  metaTags.setNoFollow();
+                }
+
+                index = directives.indexOf("all");
+                if (index >= 0) {
+                  // "all" imposes no restrictions — nothing to set
+                }
+
+                index = directives.indexOf("noindex");
+                if (index >= 0) {
+                  metaTags.setNoIndex();
+                }
+
+                index = directives.indexOf("nofollow");
+                if (index >= 0) {
+                  metaTags.setNoFollow();
+                }
+
+                index = directives.indexOf("noarchive");
+                if (index >= 0) {
+                  metaTags.setNoCache();
+                }
+              }
+
+            } // end if (name == robots)
+          }
+        }
+
+        if (equivNode != null) {
+          if (contentNode != null) {
+            String name = equivNode.getNodeValue().toLowerCase();
+            String content = contentNode.getNodeValue();
+            metaTags.getHttpEquivTags().setProperty(name, content);
+            if ("pragma".equals(name)) {
+              content = content.toLowerCase();
+              int index = content.indexOf("no-cache");
+              if (index >= 0)
+                metaTags.setNoCache();
+            } else if ("refresh".equals(name)) {
+              int idx = content.indexOf(';');
+              String time = null;
+              if (idx == -1) { // just the refresh time
+                time = content;
+              } else
+                time = content.substring(0, idx);
+              try {
+                metaTags.setRefreshTime(Integer.parseInt(time));
+                // only flag refresh once the time parsed successfully
+                metaTags.setRefresh(true);
+              } catch (Exception e) {
+                ;
+              }
+              URL refreshUrl = null;
+              if (metaTags.getRefresh() && idx != -1) { // set the URL
+                idx = content.toLowerCase().indexOf("url=");
+                if (idx == -1) { // assume a mis-formatted entry with just the
+                                 // url
+                  idx = content.indexOf(';') + 1;
+                } else
+                  idx += 4;
+                if (idx != -1) {
+                  String url = content.substring(idx);
+                  try {
+                    refreshUrl = new URL(url);
+                  } catch (Exception e) {
+                    // XXX according to the spec, this has to be an absolute
+                    // XXX url. However, many websites use relative URLs and
+                    // XXX expect browsers to handle that.
+                    // XXX Unfortunately, in some cases this may create a
+                    // XXX infinitely recursive paths (a crawler trap)...
+                    // if (!url.startsWith("/")) url = "/" + url;
+                    try {
+                      refreshUrl = new URL(currURL, url);
+                    } catch (Exception e1) {
+                      refreshUrl = null;
+                    }
+                  }
+                }
+              }
+              if (metaTags.getRefresh()) {
+                if (refreshUrl == null) {
+                  // apparently only refresh time was present. set the URL
+                  // to the same URL.
+                  refreshUrl = currURL;
+                }
+                metaTags.setRefreshHref(refreshUrl);
+              }
+            }
+          }
+        }
+
+      } else if ("base".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node hrefNode = attrs.getNamedItem("href");
+
+        if (hrefNode != null) {
+          String urlString = hrefNode.getNodeValue();
+
+          URL url = null;
+          try {
+            if (currURL == null)
+              url = new URL(urlString);
+            else
+              url = new URL(currURL, urlString);
+          } catch (Exception e) {
+            ;
+          }
+
+          if (url != null)
+            metaTags.setBaseHref(url);
+        }
+
+      }
+
+    }
+
+    NodeList children = node.getChildNodes();
+    if (children != null) {
+      int len = children.getLength();
+      for (int i = 0; i < len; i++) {
+        getMetaTagsHelper(metaTags, children.item(i), currURL);
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java
new file mode 100644
index 0000000..5d7eca9
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilters;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.protocol.Content;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.Link;
+import org.apache.tika.sax.LinkContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML
+ * representation returned by Tika as SAX events
+ ***/
+
+public class TikaParser implements org.apache.nutch.parse.Parser {
+
+  public static final Logger LOG = LoggerFactory.getLogger(TikaParser.class);
+
+  private Configuration conf;
+  private TikaConfig tikaConfig = null;
+  private DOMContentUtils utils;
+  private HtmlParseFilters htmlParseFilters;
+  private String cachingPolicy;
+  private HtmlMapper HTMLMapper;
+  private boolean upperCaseElementNames = true;
+
+  @SuppressWarnings("deprecation")
+  public ParseResult getParse(Content content) {
+    String mimeType = content.getContentType();
+    
+    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
+    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
+
+    URL base;
+    try {
+      base = new URL(content.getBaseUrl());
+    } catch (MalformedURLException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    // get the right parser using the mime type as a clue
+    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
+    byte[] raw = content.getContent();
+
+    if (parser == null) {
+      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
+      LOG.error(message);
+      return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(
+          content.getUrl(), getConf());
+    }
+
+    LOG.debug("Using Tika parser " + parser.getClass().getName()
+        + " for mime-type " + mimeType);
+
+    Metadata tikamd = new Metadata();
+
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    doc.setErrorChecking(false);
+    DocumentFragment root = doc.createDocumentFragment();
+
+    ContentHandler domHandler;
+    
+    // Check whether to use Tika's BoilerpipeContentHandler
+    if (useBoilerpipe) {
+      BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root),
+      BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
+      bpHandler.setIncludeMarkup(true);
+      domHandler = (ContentHandler)bpHandler;
+    } else {
+      DOMBuilder domBuilder = new DOMBuilder(doc, root);
+      domBuilder.setUpperCaseElementNames(upperCaseElementNames);
+      domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
+      domHandler = (ContentHandler)domBuilder;
+    }
+
+    LinkContentHandler linkContentHandler = new LinkContentHandler();
+
+    ParseContext context = new ParseContext();
+    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
+    
+    if (HTMLMapper != null)
+      context.set(HtmlMapper.class, HTMLMapper);
+    tikamd.set(Metadata.CONTENT_TYPE, mimeType);
+    try {
+      parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context);
+    } catch (Exception e) {
+      LOG.error("Error parsing " + content.getUrl(), e);
+      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    HTMLMetaTags metaTags = new HTMLMetaTags();
+    String text = "";
+    String title = "";
+    Outlink[] outlinks = new Outlink[0];
+    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();
+
+    // we have converted the sax events generated by Tika into a DOM object
+    // so we can now use the usual HTML resources from Nutch
+    // get meta directives
+    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
+    }
+
+    // check meta directives
+    if (!metaTags.getNoIndex()) { // okay to index
+      StringBuffer sb = new StringBuffer();
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting text...");
+      }
+      utils.getText(sb, root); // extract text
+      text = sb.toString();
+      sb.setLength(0);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting title...");
+      }
+      utils.getTitle(sb, root); // extract title
+      title = sb.toString().trim();
+    }
+
+    if (!metaTags.getNoFollow()) { // okay to follow links
+      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
+      URL baseTag = utils.getBase(root);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting links...");
+      }
+      
+      // pre-NUTCH-1233 DOM-based outlink extraction:
+      //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+      // Get outlinks from Tika
+      List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
+      utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks);
+      outlinks = l.toArray(new Outlink[l.size()]);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("found " + outlinks.length + " outlinks in "
+            + content.getUrl());
+      }
+    }
+
+    // populate Nutch metadata with Tika metadata
+    String[] TikaMDNames = tikamd.names();
+    for (String tikaMDName : TikaMDNames) {
+      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
+        continue;
+      String[] values = tikamd.getValues(tikaMDName);
+      for (String v : values)
+        nutchMetadata.add(tikaMDName, v);
+    }
+
+    // no outlinks? try OutlinkExtractor e.g works for mime types where no
+    // explicit markup for anchors
+
+    if (outlinks.length == 0) {
+      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
+    }
+
+    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+    if (metaTags.getRefresh()) {
+      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
+      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
+          Integer.toString(metaTags.getRefreshTime()) });
+    }
+    ParseData parseData = new ParseData(status, title, outlinks,
+        content.getMetadata(), nutchMetadata);
+    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
+        new ParseImpl(text, parseData));
+
+    // run filters on parse
+    ParseResult filteredParse = this.htmlParseFilters.filter(content,
+        parseResult, metaTags, root);
+    if (metaTags.getNoCache()) { // not okay to cache
+      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
+        entry.getValue().getData().getParseMeta()
+            .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+    }
+    return filteredParse;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.tikaConfig = null;
+
+    // do we want a custom Tika configuration file
+    // deprecated since Tika 0.7 which is based on
+    // a service provider based configuration
+    String customConfFile = conf.get("tika.config.file");
+    if (customConfFile != null) {
+      try {
+        // see if a Tika config file can be found in the job file
+        URL customTikaConfig = conf.getResource(customConfFile);
+        if (customTikaConfig != null)
+          tikaConfig = new TikaConfig(customTikaConfig);
+      } catch (Exception e1) {
+        String message = "Problem loading custom Tika configuration from "
+            + customConfFile;
+        LOG.error(message, e1);
+      }
+    } else {
+      try {
+        tikaConfig = new TikaConfig(this.getClass().getClassLoader());
+      } catch (Exception e2) {
+        String message = "Problem loading default Tika configuration";
+        LOG.error(message, e2);
+      }
+    }
+
+    // use a custom htmlmapper
+    String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
+    if (StringUtils.isNotBlank(htmlmapperClassName)) {
+      try {
+        Class HTMLMapperClass = Class.forName(htmlmapperClassName);
+        boolean interfaceOK = HtmlMapper.class
+            .isAssignableFrom(HTMLMapperClass);
+        if (!interfaceOK) {
+          throw new RuntimeException("Class " + htmlmapperClassName
+              + " does not implement HtmlMapper");
+        }
+        HTMLMapper = (HtmlMapper) HTMLMapperClass.newInstance();
+      } catch (Exception e) {
+        LOG.error("Can't generate instance for class " + htmlmapperClassName);
+        throw new RuntimeException("Can't generate instance for class "
+            + htmlmapperClassName);
+      }
+    }
+
+    this.htmlParseFilters = new HtmlParseFilters(getConf());
+    this.utils = new DOMContentUtils(conf);
+    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+        Nutch.CACHING_FORBIDDEN_CONTENT);
+    this.upperCaseElementNames = getConf().getBoolean(
+        "tika.uppercase.element.names", true);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
new file mode 100644
index 0000000..d625c33
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
@@ -0,0 +1,112 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: XMLCharacterRecognizer.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.apache.nutch.parse.tika;
+
+/**
+ * Class used to verify whether the specified <var>ch</var> conforms to the XML
+ * 1.0 definition of whitespace.
+ */
+class XMLCharacterRecognizer {
+
+  /**
+   * Returns whether the specified <var>ch</var> conforms to the XML 1.0
+   * definition of whitespace. Refer to <A
+   * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of
+   * <CODE>S</CODE></A> for details.
+   * 
+   * @param ch
+   *          Character to check as XML whitespace.
+   * @return true if <var>ch</var> is XML whitespace; otherwise false.
+   */
+  static boolean isWhiteSpace(char ch) {
+    return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
+  }
+
+  /**
+   * Tell if the string is whitespace.
+   * 
+   * @param ch
+   *          Character array to check as XML whitespace.
+   * @param start
+   *          Start index of characters in the array
+   * @param length
+   *          Number of characters in the array
+   * @return True if the characters in the array are XML whitespace; otherwise,
+   *         false.
+   */
+  static boolean isWhiteSpace(char ch[], int start, int length) {
+
+    int end = start + length;
+
+    for (int s = start; s < end; s++) {
+      if (!isWhiteSpace(ch[s]))
+        return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Tell if the string is whitespace.
+   * 
+   * @param buf
+   *          StringBuffer to check as XML whitespace.
+   * @return True if characters in buffer are XML whitespace, false otherwise
+   */
+  static boolean isWhiteSpace(StringBuffer buf) {
+
+    int n = buf.length();
+
+    for (int i = 0; i < n; i++) {
+      if (!isWhiteSpace(buf.charAt(i)))
+        return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Tell if the string is whitespace.
+   * 
+   * @param s
+   *          String to check as XML whitespace.
+   * @return True if characters in buffer are XML whitespace, false otherwise
+   */
+  static boolean isWhiteSpace(String s) {
+
+    if (null != s) {
+      int n = s.length();
+
+      for (int i = 0; i < n; i++) {
+        if (!isWhiteSpace(s.charAt(i)))
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/package-info.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/package-info.java
new file mode 100644
index 0000000..19e3f47
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse various document formats with help of
+ * <a href="http://tika.apache.org/">Apache Tika</a>.
+ */
+package org.apache.nutch.parse.tika;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
new file mode 100644
index 0000000..96029a6
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
@@ -0,0 +1,337 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.tika.DOMContentUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils {
+
+  private static final String[] testPages = {
+
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors into the LI's — but not the other
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+
+      // test frameset link extraction. The invalid frame in the middle
+      // will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+          + "</frameset>" + "</frameset>" + "</body></html>"),
+
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
+
+  private static int SKIP = 9;
+
+  private static String[] testBaseHrefs = { "http://www.nutch.org",
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something" };
+
+  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+  private static final String[] answerText = {
+      "title body anchor",
+      "title body home bots",
+      "separate this from this",
+      "my title body home 1 2",
+      "my title",
+      "my title the bottom",
+      "my title Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break . "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "title anchor1 anchor2 anchor3",
+      "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+  private static final String[] answerTitle = { "title", "title", "",
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title" };
+
+  // note: should be in page-order
+  private static Outlink[][] answerOutlinks;
+
+  private static Configuration conf;
+  private static DOMContentUtils utils = null;
+
+  @Before
+  public void setup() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.setBoolean("parser.html.form.use_action", true);
+    utils = new DOMContentUtils(conf);
+    DOMFragmentParser parser = new DOMFragmentParser();
+    parser.setFeature(
+        "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+        true);
+    for (int i = 0; i < testPages.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+      try {
+        parser.parse(
+            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+            node);
+        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+      } catch (Exception e) {
+        Assert.assertTrue("caught exception: " + e, false);
+      }
+      testDOMs[i] = node;
+    }
+    answerOutlinks = new Outlink[][] {
+        { new Outlink("http://www.nutch.org", "anchor"), },
+        { new Outlink("http://www.nutch.org/", "home"),
+            new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+        { new Outlink("http://www.nutch.org/", "separate this"),
+            new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+        { new Outlink("http://www.nutch.org/", "home"),
+            new Outlink("http://www.nutch.org/docs/1", "1"),
+            new Outlink("http://www.nutch.org/docs/2", "2"), },
+        { new Outlink("http://www.nutch.org/frames/top.html", ""),
+            new Outlink("http://www.nutch.org/frames/left.html", ""),
+            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+            new Outlink("http://www.nutch.org/frames/right.html", ""), },
+        { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+            new Outlink("http://www.nutch.org/index.html", ""),
+            new Outlink("http://www.nutch.org/maps/#bottom", ""),
+            new Outlink("http://www.nutch.org/bot.html", ""),
+            new Outlink("http://www.nutch.org/docs/index.html", ""), },
+        { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+        {},
+        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+        {},
+        { new Outlink("http://www.nutch.org/;x", "anchor1"),
+            new Outlink("http://www.nutch.org/g;x", "anchor2"),
+            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+        {
+            // this is tricky - see RFC3986 section 5.4.1 example 7
+            new Outlink("http://www.nutch.org/g", "anchor1"),
+            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+            new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                "anchor5") } };
+
+  }
+
+  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+    StringTokenizer st1 = new StringTokenizer(s1);
+    StringTokenizer st2 = new StringTokenizer(s2);
+
+    while (st1.hasMoreTokens()) {
+      if (!st2.hasMoreTokens())
+        return false;
+      if (!st1.nextToken().equals(st2.nextToken()))
+        return false;
+    }
+    if (st2.hasMoreTokens())
+      return false;
+    return true;
+  }
+
+  @Test
+  public void testGetText() throws Exception {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getText(sb, testDOMs[i]);
+      String text = sb.toString();
+      Assert.assertTrue(
+          "expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
+          equalsIgnoreWhitespace(answerText[i], text));
+    }
+  }
+
+  @Test
+  public void testGetTitle() throws Exception {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
+      utils.getTitle(sb, testDOMs[i]);
+      String text = sb.toString();
+      Assert.assertTrue(
+          "expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
+          equalsIgnoreWhitespace(answerTitle[i], text));
+    }
+  }
+
+  @Test
+  public void testGetOutlinks() throws Exception {
+    if (testDOMs[0] == null)
+      setup();
+    for (int i = 0; i < testPages.length; i++) {
+      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
+      if (i == SKIP) {
+        conf.setBoolean("parser.html.form.use_action", false);
+        utils.setConf(conf);
+      } else {
+        conf.setBoolean("parser.html.form.use_action", true);
+        utils.setConf(conf);
+      }
+      utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
+      Outlink[] outlinkArr = new Outlink[outlinks.size()];
+      outlinkArr = outlinks.toArray(outlinkArr);
+      compareOutlinks(answerOutlinks[i], outlinkArr);
+    }
+  }
+
+  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+    for (int i = 0; i < o.length; i++) {
+      sb.append(o[i].toString());
+      sb.append(System.getProperty("line.separator"));
+    }
+  }
+
+  private static final String outlinksString(Outlink[] o) {
+    StringBuffer sb = new StringBuffer();
+    appendOutlinks(sb, o);
+    return sb.toString();
+  }
+
+  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+    if (o1.length != o2.length) {
+      Assert.assertTrue(
+          "got wrong number of outlinks (expecting " + o1.length + ", got "
+              + o2.length + ")" + System.getProperty("line.separator")
+              + "answer: " + System.getProperty("line.separator")
+              + outlinksString(o1) + System.getProperty("line.separator")
+              + "got: " + System.getProperty("line.separator")
+              + outlinksString(o2) + System.getProperty("line.separator"),
+          false);
+    }
+
+    for (int i = 0; i < o1.length; i++) {
+      if (!o1[i].equals(o2[i])) {
+        Assert.assertTrue(
+            "got wrong outlinks at position " + i
+                + System.getProperty("line.separator") + "answer: "
+                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+                + "', anchor: '" + o1[i].getAnchor() + "'"
+                + System.getProperty("line.separator") + "got: "
+                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+                + "', anchor: '" + o2[i].getAnchor() + "'", false);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
new file mode 100644
index 0000000..c9394dc
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * 
+ * @author mattmann / jnioche
+ * 
+ *         Test Suite for the RSS feeds with the {@link TikaParser}.
+ * 
+ */
+public class TestFeedParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
+      .getName());
+
+  /**
+   * <p>
+   * The test method: tests out the following 2 asserts:
+   * </p>
+   * 
+   * <ul>
+   * <li>There are 3 outlinks read from the sample rss file</li>
+   * <li>The 3 outlinks read are in fact the correct outlinks from the sample
+   * file</li>
+   * </ul>
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    Configuration conf = NutchConfiguration.create();
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      // check that there are 2 outlinks:
+      // unlike the original parse-rss
+      // tika ignores the URL and description of the channel
+
+      // http://test.channel.com
+      // http://www-scf.usc.edu/~mattmann/
+      // http://www.nutch.org
+
+      ParseData theParseData = parse.getData();
+
+      Outlink[] theOutlinks = theParseData.getOutlinks();
+
+      Assert.assertTrue("There aren't 2 outlinks read!",
+          theOutlinks.length == 2);
+
+      // now check to make sure that those are the two outlinks
+      boolean hasLink1 = false, hasLink2 = false;
+
+      for (int j = 0; j < theOutlinks.length; j++) {
+        if (theOutlinks[j].getToUrl().equals(
+            "http://www-scf.usc.edu/~mattmann/")) {
+          hasLink1 = true;
+        }
+
+        if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
+          hasLink2 = true;
+        }
+      }
+
+      if (!hasLink1 || !hasLink2) {
+        Assert.fail("Outlinks read from sample rss file are not correct!");
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
new file mode 100644
index 0000000..b1762e6
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test extraction of image metadata
+ */
+public class TestImageMetadata {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  private String[] sampleFiles = { "nutch_logo_tm.gif", };
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Configuration conf = NutchConfiguration.create();
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      Assert.assertEquals("121", parse.getData().getMeta("width"));
+      Assert.assertEquals("48", parse.getData().getMeta("height"));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
new file mode 100644
index 0000000..576b3df
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Unit tests for MSWordParser.
+ * 
+ * @author John Xing
+ */
+public class TestMSWordParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-msword/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+  private String[] sampleFiles = { "word97.doc" };
+
+  private String expectedText = "This is a sample doc file prepared for nutch.";
+
+  private Configuration conf;
+
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+    return parse.getText();
+  }
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String found = getTextContent(sampleFiles[i]);
+      Assert.assertTrue("text found : '" + found + "'",
+          found.startsWith(expectedText));
+    }
+  }
+
+  @Test
+  public void testOpeningDocs() throws ProtocolException, ParseException {
+    String[] filenames = new File(sampleDir).list();
+    for (int i = 0; i < filenames.length; i++) {
+      if (filenames[i].endsWith(".doc") == false)
+        continue;
+      Assert.assertTrue("cann't read content of " + filenames[i],
+          getTextContent(filenames[i]).length() > 0);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
new file mode 100644
index 0000000..6960bad
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for OOParser.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-oo/build.xml during plugin compilation.
+  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
+
+  // Whitespace-normalized contents of ootest.txt, loaded by the constructor.
+  private String expectedText;
+
+  private String sampleText = "ootest.txt";
+
+  /**
+   * Parses each OpenOffice sample document with parse-tika and checks that
+   * some text was extracted. Only the presence of text is asserted because
+   * element ordering may differ between parser versions.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+    Protocol protocol;
+    ProtocolFactory factory = new ProtocolFactory(conf);
+
+    System.out.println("Expected : " + expectedText);
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      if (sampleFiles[i].startsWith("ootest") == false)
+        continue;
+
+      protocol = factory.getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+
+      // simply test for the presence of a text - the ordering of the elements
+      // may differ from what was expected
+      // in the previous tests
+      Assert.assertTrue(text != null && text.length() > 0);
+
+      System.out.println("Found " + sampleFiles[i] + ": " + text);
+    }
+  }
+
+  public TestOOParser() {
+    // read the test string; try-with-resources guarantees the stream is
+    // closed even when read() throws (the old code leaked it in that case)
+    try (InputStreamReader isr = new InputStreamReader(new FileInputStream(
+        sampleDir + fileSeparator + sampleText), "UTF-8")) {
+      StringBuilder sb = new StringBuilder();
+      int len = 0;
+      char[] buf = new char[1024];
+      while ((len = isr.read(buf)) > 0) {
+        sb.append(buf, 0, len);
+      }
+      expectedText = sb.toString();
+      // normalize space
+      expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
new file mode 100644
index 0000000..9884f0c
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for PdfParser.
+ * 
+ * @author John Xing
+ */
+public class TestPdfParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+  private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+
+  private String expectedText = "A VERY SMALL PDF FILE";
+
+  /**
+   * Parses each sample PDF with parse-tika and checks that the expected
+   * sentence occurs somewhere in the extracted text.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    // loop-invariant: one configuration is enough for all sample files
+    Configuration conf = NutchConfiguration.create();
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      int index = parse.getText().indexOf(expectedText);
+      // indexOf() returns -1 when the text is absent; a match at offset 0
+      // is still a match, so the correct check is ">= 0", not "> 0".
+      Assert.assertTrue(index >= 0);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
new file mode 100644
index 0000000..f15d821
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tika;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
+ * 
+ * @author Andy Hedges
+ */
+public class TestRTFParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+  private String rtfFile = "test.rtf";
+
+  // Disabled: the trimmed-text comparison below was failing when this test
+  // was ported — see the @Ignore reason. Re-enable once resolved.
+  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    // fetch the sample RTF file via the file: protocol and parse with Tika
+    Configuration conf = NutchConfiguration.create();
+    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
+    protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
+        .getContent();
+    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
+        content.getUrl());
+    String text = parse.getText();
+    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
+        text.trim());
+
+    // title and Dublin Core subject must also survive the parse
+    String title = parse.getData().getTitle();
+    Metadata meta = parse.getData().getParseMeta();
+
+    Assert.assertEquals("test rft document", title);
+    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
new file mode 100644
index 0000000..4224f93
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.tika.HTMLMetaProcessor;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor {
+
+  /*
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
+
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
+  };
+
+  private URL[][] currURLsAndAnswers;
+
+  @Test
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser = new DOMFragmentParser();
+    ;
+
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
+    } catch (Exception e) {
+      Assert.assertTrue("couldn't make test URLs!", false);
+    }
+
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
+
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+
+      try {
+        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+      } catch (Exception e) {
+        e.printStackTrace();
+      }
+
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
+
+      Assert.assertTrue("got index wrong on test " + i,
+          robotsMeta.getNoIndex() == answers[i][0]);
+      Assert.assertTrue("got follow wrong on test " + i,
+          robotsMeta.getNoFollow() == answers[i][1]);
+      Assert.assertTrue("got cache wrong on test " + i,
+          robotsMeta.getNoCache() == answers[i][2]);
+      Assert
+          .assertTrue(
+              "got base href wrong on test " + i + " (got "
+                  + robotsMeta.getBaseHref() + ")",
+              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
+                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
+
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf b/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf
new file mode 100644
index 0000000..383cebb
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf differ


[47/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbMerger.java b/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbMerger.java
new file mode 100644
index 0000000..39923ac
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbMerger.java
@@ -0,0 +1,204 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+
+/**
+ * This tool merges several LinkDb-s into one, optionally filtering URLs through
+ * the current URLFilters, to skip prohibited URLs and links.
+ * 
+ * <p>
+ * It's possible to use this tool just for filtering - in that case only one
+ * LinkDb should be specified in arguments.
+ * </p>
+ * <p>
+ * If more than one LinkDb contains information about the same URL, all inlinks
+ * are accumulated, but only at most <code>linkdb.max.inlinks</code> inlinks will
+ * ever be added.
+ * </p>
+ * <p>
+ * If activated, URLFilters will be applied to both the target URLs and to any
+ * incoming link URL. If a target URL is prohibited, all inlinks to that target
+ * will be removed, including the target URL. If some of incoming links are
+ * prohibited, only they will be removed, and they won't count when checking the
+ * above-mentioned maximum limit.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class LinkDbMerger extends Configured implements Tool,
+    Reducer<Text, Inlinks, Text, Inlinks> {
+  private static final Logger LOG = LoggerFactory.getLogger(LinkDbMerger.class);
+
+  // Upper bound on inlinks kept per target URL ("linkdb.max.inlinks").
+  private int maxInlinks;
+
+  public LinkDbMerger() {
+
+  }
+
+  public LinkDbMerger(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Accumulates inlinks for one target URL across all input LinkDb-s, keeping
+   * at most {@code maxInlinks} entries; keys that end up with no inlinks are
+   * dropped from the output.
+   */
+  public void reduce(Text key, Iterator<Inlinks> values,
+      OutputCollector<Text, Inlinks> output, Reporter reporter)
+      throws IOException {
+
+    Inlinks result = new Inlinks();
+
+    while (values.hasNext()) {
+      Inlinks inlinks = values.next();
+
+      // copy only as many inlinks as still fit under the global cap
+      int end = Math.min(maxInlinks - result.size(), inlinks.size());
+      Iterator<Inlink> it = inlinks.iterator();
+      int i = 0;
+      while (it.hasNext() && i++ < end) {
+        result.add(it.next());
+      }
+    }
+    if (result.size() == 0)
+      return;
+    output.collect(key, result);
+
+  }
+
+  public void configure(JobConf job) {
+    // default cap matches the LinkDb default of 10000 inlinks per URL
+    maxInlinks = job.getInt("linkdb.max.inlinks", 10000);
+  }
+
+  public void close() throws IOException {
+  }
+
+  /**
+   * Runs the merge job over the given LinkDb directories and moves the job
+   * output into {@code output}/{@link LinkDb#CURRENT_NAME}.
+   *
+   * @param output destination LinkDb directory
+   * @param dbs input LinkDb directories to merge
+   * @param normalize whether to apply URL normalizers
+   * @param filter whether to apply URL filters
+   */
+  public void merge(Path output, Path[] dbs, boolean normalize, boolean filter)
+      throws Exception {
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("LinkDb merge: starting at " + sdf.format(start));
+
+    JobConf job = createMergeJob(getConf(), output, normalize, filter);
+    for (int i = 0; i < dbs.length; i++) {
+      FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));
+    }
+    JobClient.runJob(job);
+    FileSystem fs = FileSystem.get(getConf());
+    fs.mkdirs(output);
+    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output,
+        LinkDb.CURRENT_NAME));
+
+    long end = System.currentTimeMillis();
+    LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  /**
+   * Builds the merge job: LinkDbFilter as mapper (normalizing/filtering per
+   * the flags), this class as reducer, writing a compressed MapFile into a
+   * temporary "linkdb-merge-*" directory.
+   */
+  public static JobConf createMergeJob(Configuration config, Path linkDb,
+      boolean normalize, boolean filter) {
+    Path newLinkDb = new Path("linkdb-merge-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    JobConf job = new NutchJob(config);
+    job.setJobName("linkdb merge " + linkDb);
+
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(LinkDbFilter.class);
+    job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
+    job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
+    job.setReducerClass(LinkDbMerger.class);
+
+    FileOutputFormat.setOutputPath(job, newLinkDb);
+    job.setOutputFormat(MapFileOutputFormat.class);
+    job.setBoolean("mapred.output.compress", true);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Inlinks.class);
+
+    // https://issues.apache.org/jira/browse/NUTCH-1069
+    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    return job;
+  }
+
+  /**
+   * @param args
+   */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(),
+        args);
+    System.exit(res);
+  }
+
+  /** Parses command-line arguments and runs {@link #merge}. */
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err
+          .println("Usage: LinkDbMerger <output_linkdb> <linkdb1> [<linkdb2> <linkdb3> ...] [-normalize] [-filter]");
+      System.err.println("\toutput_linkdb\toutput LinkDb");
+      System.err
+          .println("\tlinkdb1 ...\tinput LinkDb-s (single input LinkDb is ok)");
+      System.err
+          .println("\t-normalize\tuse URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)");
+      System.err
+          .println("\t-filter\tuse URLFilters on both fromUrls and toUrls in linkdb(s)");
+      return -1;
+    }
+    Path output = new Path(args[0]);
+    ArrayList<Path> dbs = new ArrayList<Path>();
+    boolean normalize = false;
+    boolean filter = false;
+    // first argument is the output; everything else is an input db or a flag
+    for (int i = 1; i < args.length; i++) {
+      if (args[i].equals("-filter")) {
+        filter = true;
+      } else if (args[i].equals("-normalize")) {
+        normalize = true;
+      } else
+        dbs.add(new Path(args[i]));
+    }
+    try {
+      merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("LinkDbMerger: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbReader.java b/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbReader.java
new file mode 100644
index 0000000..2e50e9a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+
+import java.text.SimpleDateFormat;
+import java.util.Iterator;
+import java.io.Closeable;
+
+/**
+ * Reads a LinkDb: looks up the inlinks of a single URL, or dumps the whole
+ * database (optionally restricted by a regex) to text files.
+ */
+public class LinkDbReader extends Configured implements Tool, Closeable {
+  public static final Logger LOG = LoggerFactory.getLogger(LinkDbReader.class);
+
+  private static final Partitioner<WritableComparable, Writable> PARTITIONER = new HashPartitioner<WritableComparable, Writable>();
+
+  private FileSystem fs;
+  private Path directory;
+  // MapFile readers over <directory>/current, opened lazily in getInlinks().
+  private MapFile.Reader[] readers;
+
+  public LinkDbReader() {
+
+  }
+
+  public LinkDbReader(Configuration conf, Path directory) throws Exception {
+    setConf(conf);
+    init(directory);
+  }
+
+  public void init(Path directory) throws Exception {
+    this.fs = FileSystem.get(getConf());
+    this.directory = directory;
+  }
+
+  /**
+   * Returns the anchor texts of all inlinks of {@code url}, or null if the
+   * URL has no entry in the LinkDb.
+   */
+  public String[] getAnchors(Text url) throws IOException {
+    Inlinks inlinks = getInlinks(url);
+    if (inlinks == null)
+      return null;
+    return inlinks.getAnchors();
+  }
+
+  /** Looks up the {@link Inlinks} record for {@code url} in the LinkDb. */
+  public Inlinks getInlinks(Text url) throws IOException {
+
+    // NOTE(review): lazy init checks 'readers' outside the lock and the field
+    // is not volatile — concurrent first callers may race; confirm this
+    // reader is only used single-threaded.
+    if (readers == null) {
+      synchronized (this) {
+        readers = MapFileOutputFormat.getReaders(fs, new Path(directory,
+            LinkDb.CURRENT_NAME), getConf());
+      }
+    }
+
+    return (Inlinks) MapFileOutputFormat.getEntry(readers, PARTITIONER, url,
+        new Inlinks());
+  }
+
+  public void close() throws IOException {
+    if (readers != null) {
+      for (int i = 0; i < readers.length; i++) {
+        readers[i].close();
+      }
+    }
+  }
+  
+  /** Identity mapper that passes through only keys matching "linkdb.regex". */
+  public static class LinkDBDumpMapper implements Mapper<Text, Inlinks, Text, Inlinks> {
+    Pattern pattern = null;
+    Matcher matcher = null;
+    
+    public void configure(JobConf job) {
+      if (job.get("linkdb.regex", null) != null) {
+        pattern = Pattern.compile(job.get("linkdb.regex"));
+      }
+    }
+
+    public void close() {}
+    public void map(Text key, Inlinks value, OutputCollector<Text, Inlinks> output, Reporter reporter)
+            throws IOException {
+
+      // drop records whose URL does not match the configured regex
+      if (pattern != null) {
+        matcher = pattern.matcher(key.toString());
+        if (!matcher.matches()) {
+          return;
+        }
+      }
+
+      output.collect(key, value);
+    }
+  }
+
+  /**
+   * Dumps the LinkDb to text files in {@code output}; when {@code regex} is
+   * non-null only URLs matching it are emitted.
+   */
+  public void processDumpJob(String linkdb, String output, String regex) throws IOException {
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    if (LOG.isInfoEnabled()) {
+      LOG.info("LinkDb dump: starting at " + sdf.format(start));
+      LOG.info("LinkDb dump: db: " + linkdb);
+    }
+    Path outFolder = new Path(output);
+
+    JobConf job = new NutchJob(getConf());
+    job.setJobName("read " + linkdb);
+    
+    // the filtering mapper is only installed when a regex was requested
+    if (regex != null) {
+      job.set("linkdb.regex", regex);
+      job.setMapperClass(LinkDBDumpMapper.class);
+    }
+
+    FileInputFormat.addInputPath(job, new Path(linkdb, LinkDb.CURRENT_NAME));
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    FileOutputFormat.setOutputPath(job, outFolder);
+    job.setOutputFormat(TextOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Inlinks.class);
+
+    JobClient.runJob(job);
+
+    long end = System.currentTimeMillis();
+    LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(),
+        args);
+    System.exit(res);
+  }
+
+  /** Command-line entry: "-dump <out_dir> [-regex <regex>]" or "-url <url>". */
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err
+          .println("Usage: LinkDbReader <linkdb> (-dump <out_dir> [-regex <regex>]) | -url <url>");
+      System.err
+          .println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>");
+      System.err
+          .println("\t\t-regex <regex>\trestrict to url's matching expression");
+      System.err
+          .println("\t-url <url>\tprint information about <url> to System.out");
+      return -1;
+    }
+    try {
+      if (args[1].equals("-dump")) {
+        String regex = null;
+        for (int i = 2; i < args.length; i++) {
+          if (args[i].equals("-regex")) {
+            regex = args[++i];
+          }
+        }
+        processDumpJob(args[0], args[2], regex);
+        return 0;
+      } else if (args[1].equals("-url")) {
+        init(new Path(args[0]));
+        Inlinks links = getInlinks(new Text(args[2]));
+        if (links == null) {
+          System.out.println(" - no link information.");
+        } else {
+          Iterator<Inlink> it = links.iterator();
+          while (it.hasNext()) {
+            System.out.println(it.next().toString());
+          }
+        }
+        return 0;
+      } else {
+        System.err.println("Error: wrong argument " + args[1]);
+        return -1;
+      }
+    } catch (Exception e) {
+      LOG.error("LinkDbReader: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/MD5Signature.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/MD5Signature.java b/nutch-core/src/main/java/org/apache/nutch/crawl/MD5Signature.java
new file mode 100644
index 0000000..f6ec8dd
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/MD5Signature.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Default implementation of a page signature. It calculates an MD5 hash of the
+ * raw binary content of a page. In case there is no content, it calculates a
+ * hash from the page's URL.
+ * 
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class MD5Signature extends Signature {
+
+  public byte[] calculate(Content content, Parse parse) {
+    byte[] data = content.getContent();
+    if (data == null)
+      data = content.getUrl().getBytes();
+    return MD5Hash.digest(data).getDigest();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
new file mode 100644
index 0000000..4fe5cef
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
@@ -0,0 +1,236 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extension of @see AdaptiveFetchSchedule that allows for more flexible
+ * configuration of DEC and INC factors for various MIME-types.
+ * 
+ * This class can be typically used in cases where a recrawl consists of many
+ * different MIME-types. It's not very common for MIME-types other than
+ * text/html to change frequently. Using this class you can configure different
+ * factors per MIME-type so to prefer frequently changing MIME-types over
+ * others.
+ * 
+ * For it to work this class relies on the Content-Type MetaData key being
+ * present in the CrawlDB. This can either be done when injecting new URL's or
+ * by adding "Content-Type" to the db.parsemeta.to.crawldb configuration setting
+ * to force MIME-types of newly discovered URL's to be added to the CrawlDB.
+ * 
+ * @author markus
+ */
+public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule {
+  // Loggg
+  public static final Logger LOG = LoggerFactory
+      .getLogger(MimeAdaptiveFetchSchedule.class);
+
+  // Conf directives
+  public static final String SCHEDULE_INC_RATE = "db.fetch.schedule.adaptive.inc_rate";
+  public static final String SCHEDULE_DEC_RATE = "db.fetch.schedule.adaptive.dec_rate";
+  public static final String SCHEDULE_MIME_FILE = "db.fetch.schedule.mime.file";
+
+  // Default values for DEC and INC rate
+  private float defaultIncRate;
+  private float defaultDecRate;
+
+  // Structure to store inc and dec rates per MIME-type
+  private class AdaptiveRate {
+    public float inc;
+    public float dec;
+
+    public AdaptiveRate(Float inc, Float dec) {
+      this.inc = inc;
+      this.dec = dec;
+    }
+  }
+
+  // Here we store the mime's and their delta's
+  private HashMap<String, AdaptiveRate> mimeMap;
+
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null)
+      return;
+
+    // Read and set the default INC and DEC rates in case we cannot set values
+    // based on MIME-type
+    defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
+    defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);
+
+    // Where's the mime/factor file?
+    Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE,
+        "adaptive-mimetypes.txt"));
+
+    try {
+      readMimeFile(mimeFile);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  @Override
+  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime,
+      long modifiedTime, int state) {
+
+    // Set defaults
+    INC_RATE = defaultIncRate;
+    DEC_RATE = defaultDecRate;
+
+    // Check if the Content-Type field is available in the CrawlDatum
+    if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) {
+      // Get the MIME-type of the current URL
+      String currentMime = MimeUtil.cleanMimeType(datum.getMetaData()
+          .get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString());
+
+      // Check if this MIME-type exists in our map
+      if (mimeMap.containsKey(currentMime)) {
+        // Yes, set the INC and DEC rates for this MIME-type
+        INC_RATE = mimeMap.get(currentMime).inc;
+        DEC_RATE = mimeMap.get(currentMime).dec;
+      }
+    }
+
+    return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+        fetchTime, modifiedTime, state);
+  }
+
+  /**
+   * Reads the mime types and their associated INC/DEC factors in a HashMap
+   * 
+   * @param mimeFile
+   *          Reader
+   * @return void
+   */
+  private void readMimeFile(Reader mimeFile) throws IOException {
+    // Instance of our mime/factor map
+    mimeMap = new HashMap<String, AdaptiveRate>();
+
+    // Open a reader
+    BufferedReader reader = new BufferedReader(mimeFile);
+
+    String line = null;
+    String[] splits = null;
+
+    // Read all lines
+    while ((line = reader.readLine()) != null) {
+      // Skip blank lines and comments
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        // Split the line by TAB
+        splits = line.split("\t");
+
+        // Sanity check, we need two or three items
+        if (splits.length == 3) {
+          // Add a lower cased MIME-type and the factor to the map
+          mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(
+              new Float(splits[1]), new Float(splits[2])));
+        } else {
+          LOG.warn("Invalid configuration line in: " + line);
+        }
+      }
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    FetchSchedule fs = new MimeAdaptiveFetchSchedule();
+    fs.setConf(NutchConfiguration.create());
+    // we start the time at 0, for simplicity
+    long curTime = 0;
+    long delta = 1000L * 3600L * 24L; // 2 hours
+    // we trigger the update of the page every 30 days
+    long update = 1000L * 3600L * 24L * 30L; // 30 days
+    boolean changed = true;
+    long lastModified = 0;
+    int miss = 0;
+    int totalMiss = 0;
+    int maxMiss = 0;
+    int fetchCnt = 0;
+    int changeCnt = 0;
+
+    // initial fetchInterval is 10 days
+    CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
+
+    // Set a default MIME-type to test with
+    org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
+    x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text(
+        "text/html; charset=utf-8"));
+    p.setMetaData(x);
+
+    p.setFetchTime(0);
+    LOG.info(p.toString());
+
+    // let's move the timeline a couple of deltas
+    for (int i = 0; i < 10000; i++) {
+      if (lastModified + update < curTime) {
+        // System.out.println("i=" + i + ", lastModified=" + lastModified +
+        // ", update=" + update + ", curTime=" + curTime);
+        changed = true;
+        changeCnt++;
+        lastModified = curTime;
+      }
+
+      LOG.info(i + ". " + changed + "\twill fetch at "
+          + (p.getFetchTime() / delta) + "\tinterval "
+          + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+          + miss);
+
+      if (p.getFetchTime() <= curTime) {
+        fetchCnt++;
+        fs.setFetchSchedule(new Text("http://www.example.com"), p, p
+            .getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+            changed ? FetchSchedule.STATUS_MODIFIED
+                : FetchSchedule.STATUS_NOTMODIFIED);
+
+        LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+            + (p.getFetchTime() / delta) + "\tinterval "
+            + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
+
+        if (!changed)
+          miss++;
+        if (miss > maxMiss)
+          maxMiss = miss;
+        changed = false;
+        totalMiss += miss;
+        miss = 0;
+      }
+
+      if (changed)
+        miss++;
+      curTime += delta;
+    }
+    LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+        + " times.");
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/NutchWritable.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/NutchWritable.java b/nutch-core/src/main/java/org/apache/nutch/crawl/NutchWritable.java
new file mode 100644
index 0000000..589b8b9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/NutchWritable.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.util.GenericWritableConfigurable;
+
+@SuppressWarnings("unchecked")
+public class NutchWritable extends GenericWritableConfigurable {
+
+  private static Class<? extends Writable>[] CLASSES = null;
+
+  static {
+    CLASSES = (Class<? extends Writable>[]) new Class<?>[] {
+        org.apache.hadoop.io.NullWritable.class,
+        org.apache.hadoop.io.BooleanWritable.class,
+        org.apache.hadoop.io.LongWritable.class,
+        org.apache.hadoop.io.ByteWritable.class,
+        org.apache.hadoop.io.BytesWritable.class,
+        org.apache.hadoop.io.FloatWritable.class,
+        org.apache.hadoop.io.IntWritable.class,
+        org.apache.hadoop.io.MapWritable.class,
+        org.apache.hadoop.io.Text.class, org.apache.hadoop.io.MD5Hash.class,
+        org.apache.nutch.crawl.CrawlDatum.class,
+        org.apache.nutch.crawl.Inlink.class,
+        org.apache.nutch.crawl.Inlinks.class,
+        org.apache.nutch.indexer.NutchIndexAction.class,
+        org.apache.nutch.metadata.Metadata.class,
+        org.apache.nutch.parse.Outlink.class,
+        org.apache.nutch.parse.ParseText.class,
+        org.apache.nutch.parse.ParseData.class,
+        org.apache.nutch.parse.ParseImpl.class,
+        org.apache.nutch.parse.ParseStatus.class,
+        org.apache.nutch.protocol.Content.class,
+        org.apache.nutch.protocol.ProtocolStatus.class,
+        org.apache.nutch.scoring.webgraph.LinkDatum.class,
+        org.apache.nutch.hostdb.HostDatum.class };
+  }
+
+  public NutchWritable() {
+  }
+
+  public NutchWritable(Writable instance) {
+    set(instance);
+  }
+
+  @Override
+  protected Class<? extends Writable>[] getTypes() {
+    return CLASSES;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/Signature.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/Signature.java b/nutch-core/src/main/java/org/apache/nutch/crawl/Signature.java
new file mode 100644
index 0000000..21dfe07
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/Signature.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.Content;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+
+public abstract class Signature implements Configurable {
+  protected Configuration conf;
+
+  public abstract byte[] calculate(Content content, Parse parse);
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/SignatureComparator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/SignatureComparator.java b/nutch-core/src/main/java/org/apache/nutch/crawl/SignatureComparator.java
new file mode 100644
index 0000000..d217d93
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/SignatureComparator.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.Comparator;
+
/**
 * Compares two page signatures (byte arrays). Shorter arrays order before
 * longer ones; equal-length arrays are compared byte by byte. Nulls order
 * first, and non-byte-array operands order before byte arrays.
 */
public class SignatureComparator implements Comparator<Object> {

  public int compare(Object o1, Object o2) {
    return _compare(o1, o2);
  }

  public static int _compare(Object o1, Object o2) {
    // Null handling: null sorts before anything non-null.
    if (o1 == null) {
      return (o2 == null) ? 0 : -1;
    }
    if (o2 == null) {
      return 1;
    }
    // Anything that is not a byte[] sorts before a byte[].
    if (!(o1 instanceof byte[])) {
      return -1;
    }
    if (!(o2 instanceof byte[])) {
      return 1;
    }
    byte[] a = (byte[]) o1;
    byte[] b = (byte[]) o2;
    return _compare(a, 0, a.length, b, 0, b.length);
  }

  public static int _compare(byte[] data1, int s1, int l1, byte[] data2,
      int s2, int l2) {
    // Length decides first: the shorter span sorts first.
    if (l1 != l2) {
      return (l1 > l2) ? 1 : -1;
    }
    // Equal lengths: first differing byte decides.
    for (int i = 0; i < l1; i++) {
      int diff = data1[s1 + i] - data2[s2 + i];
      if (diff != 0) {
        return diff;
      }
    }
    return 0;
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/SignatureFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/SignatureFactory.java b/nutch-core/src/main/java/org/apache/nutch/crawl/SignatureFactory.java
new file mode 100644
index 0000000..16d8cc0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/SignatureFactory.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.ObjectCache;
+
+/**
+ * Factory class, which instantiates a Signature implementation according to the
+ * current Configuration configuration. This newly created instance is cached in
+ * the Configuration instance, so that it could be later retrieved.
+ * 
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class SignatureFactory {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SignatureFactory.class);
+
+  private SignatureFactory() {
+  } // no public ctor
+
+  /** Return the default Signature implementation. */
+  public synchronized static Signature getSignature(Configuration conf) {
+    String clazz = conf.get("db.signature.class", MD5Signature.class.getName());
+    ObjectCache objectCache = ObjectCache.get(conf);
+    Signature impl = (Signature) objectCache.getObject(clazz);
+    if (impl == null) {
+      try {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Using Signature impl: " + clazz);
+        }
+        Class<?> implClass = Class.forName(clazz);
+        impl = (Signature) implClass.newInstance();
+        impl.setConf(conf);
+        objectCache.setObject(clazz, impl);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create " + clazz, e);
+      }
+    }
+    return impl;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/TextMD5Signature.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/TextMD5Signature.java b/nutch-core/src/main/java/org/apache/nutch/crawl/TextMD5Signature.java
new file mode 100644
index 0000000..b88cfa6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/TextMD5Signature.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Implementation of a page signature. It calculates an MD5 hash of the textual
+ * content of a page. In case there is no content, it calculates a hash from the
+ * page's URL.
+ */
+public class TextMD5Signature extends Signature {
+
+  Signature fallback = new MD5Signature();
+
+  public byte[] calculate(Content content, Parse parse) {
+    String text = parse.getText();
+
+    if (text == null || text.length() == 0) {
+      return fallback.calculate(content, parse);
+    }
+
+    return MD5Hash.digest(text).getDigest();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/TextProfileSignature.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/TextProfileSignature.java b/nutch-core/src/main/java/org/apache/nutch/crawl/TextProfileSignature.java
new file mode 100644
index 0000000..5d930f9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/TextProfileSignature.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * <p>
+ * An implementation of a page signature. It calculates an MD5 hash of a plain
+ * text "profile" of a page. In case there is no text, it calculates a hash
+ * using the {@link MD5Signature}.
+ * </p>
+ * <p>
+ * The algorithm to calculate a page "profile" takes the plain text version of a
+ * page and performs the following steps:
+ * <ul>
+ * <li>remove all characters except letters and digits, and bring all characters
+ * to lower case,</li>
+ * <li>split the text into tokens (all consecutive non-whitespace characters),</li>
+ * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2
+ * characters),</li>
+ * <li>sort the list of tokens by decreasing frequency,</li>
+ * <li>round down the counts of tokens to the nearest multiple of QUANT (
+ * <code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is
+ * 0.01f by default, and <code>maxFreq</code> is the maximum token frequency).
+ * If <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2
+ * (which means that tokens with frequency 1 are always discarded).</li>
+ * <li>tokens, which frequency after quantization falls below QUANT, are
+ * discarded.</li>
+ * <li>create a list of tokens and their quantized frequency, separated by
+ * spaces, in the order of decreasing frequency.</li>
+ * </ul>
+ * This list is then submitted to an MD5 hash calculation.
+ * 
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class TextProfileSignature extends Signature {
+
+  Signature fallback = new MD5Signature();
+
+  public byte[] calculate(Content content, Parse parse) {
+    int MIN_TOKEN_LEN = getConf().getInt(
+        "db.signature.text_profile.min_token_len", 2);
+    float QUANT_RATE = getConf().getFloat(
+        "db.signature.text_profile.quant_rate", 0.01f);
+    HashMap<String, Token> tokens = new HashMap<String, Token>();
+    String text = null;
+    if (parse != null)
+      text = parse.getText();
+    if (text == null || text.length() == 0)
+      return fallback.calculate(content, parse);
+    StringBuffer curToken = new StringBuffer();
+    int maxFreq = 0;
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      if (Character.isLetterOrDigit(c)) {
+        curToken.append(Character.toLowerCase(c));
+      } else {
+        if (curToken.length() > 0) {
+          if (curToken.length() > MIN_TOKEN_LEN) {
+            // add it
+            String s = curToken.toString();
+            Token tok = tokens.get(s);
+            if (tok == null) {
+              tok = new Token(0, s);
+              tokens.put(s, tok);
+            }
+            tok.cnt++;
+            if (tok.cnt > maxFreq)
+              maxFreq = tok.cnt;
+          }
+          curToken.setLength(0);
+        }
+      }
+    }
+    // check the last token
+    if (curToken.length() > MIN_TOKEN_LEN) {
+      // add it
+      String s = curToken.toString();
+      Token tok = tokens.get(s);
+      if (tok == null) {
+        tok = new Token(0, s);
+        tokens.put(s, tok);
+      }
+      tok.cnt++;
+      if (tok.cnt > maxFreq)
+        maxFreq = tok.cnt;
+    }
+    Iterator<Token> it = tokens.values().iterator();
+    ArrayList<Token> profile = new ArrayList<Token>();
+    // calculate the QUANT value
+    int QUANT = Math.round(maxFreq * QUANT_RATE);
+    if (QUANT < 2) {
+      if (maxFreq > 1)
+        QUANT = 2;
+      else
+        QUANT = 1;
+    }
+    while (it.hasNext()) {
+      Token t = it.next();
+      // round down to the nearest QUANT
+      t.cnt = (t.cnt / QUANT) * QUANT;
+      // discard the frequencies below the QUANT
+      if (t.cnt < QUANT) {
+        continue;
+      }
+      profile.add(t);
+    }
+    Collections.sort(profile, new TokenComparator());
+    StringBuffer newText = new StringBuffer();
+    it = profile.iterator();
+    while (it.hasNext()) {
+      Token t = it.next();
+      if (newText.length() > 0)
+        newText.append("\n");
+      newText.append(t.toString());
+    }
+    return MD5Hash.digest(newText.toString()).getDigest();
+  }
+
+  private static class Token {
+    public int cnt;
+    public String val;
+
+    public Token(int cnt, String val) {
+      this.cnt = cnt;
+      this.val = val;
+    }
+
+    public String toString() {
+      return val + " " + cnt;
+    }
+  }
+
+  private static class TokenComparator implements Comparator<Token> {
+    public int compare(Token t1, Token t2) {
+      return t2.cnt - t1.cnt;
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    TextProfileSignature sig = new TextProfileSignature();
+    sig.setConf(NutchConfiguration.create());
+    HashMap<String, byte[]> res = new HashMap<String, byte[]>();
+    File[] files = new File(args[0]).listFiles();
+    for (int i = 0; i < files.length; i++) {
+      FileInputStream fis = new FileInputStream(files[i]);
+      BufferedReader br = new BufferedReader(
+          new InputStreamReader(fis, "UTF-8"));
+      StringBuffer text = new StringBuffer();
+      String line = null;
+      while ((line = br.readLine()) != null) {
+        if (text.length() > 0)
+          text.append("\n");
+        text.append(line);
+      }
+      br.close();
+      byte[] signature = sig.calculate(null, new ParseImpl(text.toString(),
+          null));
+      res.put(files[i].toString(), signature);
+    }
+    Iterator<String> it = res.keySet().iterator();
+    while (it.hasNext()) {
+      String name = it.next();
+      byte[] signature = res.get(name);
+      System.out.println(name + "\t" + StringUtil.toHexString(signature));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/URLPartitioner.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/URLPartitioner.java b/nutch-core/src/main/java/org/apache/nutch/crawl/URLPartitioner.java
new file mode 100644
index 0000000..4675f83
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/URLPartitioner.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.net.InetAddress;
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.net.UnknownHostException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Partition urls by host, domain name or IP depending on the value of the
+ * parameter 'partition.url.mode' which can be 'byHost', 'byDomain' or 'byIP'
+ */
+public class URLPartitioner implements Partitioner<Text, Writable> {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(URLPartitioner.class);
+
+  public static final String PARTITION_MODE_KEY = "partition.url.mode";
+
+  public static final String PARTITION_MODE_HOST = "byHost";
+  public static final String PARTITION_MODE_DOMAIN = "byDomain";
+  public static final String PARTITION_MODE_IP = "byIP";
+
+  private int seed;
+  private URLNormalizers normalizers;
+  private String mode = PARTITION_MODE_HOST;
+
+  /**
+   * Reads the partition mode and hash seed from the job configuration,
+   * falling back to {@link #PARTITION_MODE_HOST} for unknown modes.
+   */
+  public void configure(JobConf job) {
+    seed = job.getInt("partition.url.seed", 0);
+    mode = job.get(PARTITION_MODE_KEY, PARTITION_MODE_HOST);
+    // check that the mode is known
+    if (!mode.equals(PARTITION_MODE_IP) && !mode.equals(PARTITION_MODE_DOMAIN)
+        && !mode.equals(PARTITION_MODE_HOST)) {
+      LOG.error("Unknown partition mode : " + mode + " - forcing to byHost");
+      mode = PARTITION_MODE_HOST;
+    }
+    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
+  }
+
+  public void close() {
+  }
+
+  /**
+   * Hash by host, domain name or IP depending on the configured mode.
+   * Falls back to the hash of the raw URL string when the URL cannot be
+   * parsed or (for byIP) the host cannot be resolved.
+   */
+  public int getPartition(Text key, Writable value, int numReduceTasks) {
+    String urlString = key.toString();
+    URL url = null;
+    int hashCode = urlString.hashCode();
+    try {
+      urlString = normalizers.normalize(urlString,
+          URLNormalizers.SCOPE_PARTITION);
+      url = new URL(urlString);
+      hashCode = url.getHost().hashCode();
+    } catch (MalformedURLException e) {
+      LOG.warn("Malformed URL: '" + urlString + "'");
+    }
+
+    // Refine the hash only when the URL parsed successfully; the original
+    // code dereferenced a possibly-null url in the byIP branch (NPE).
+    if (url != null) {
+      if (mode.equals(PARTITION_MODE_DOMAIN)) {
+        hashCode = URLUtil.getDomainName(url).hashCode();
+      } else if (mode.equals(PARTITION_MODE_IP)) {
+        try {
+          InetAddress address = InetAddress.getByName(url.getHost());
+          hashCode = address.getHostAddress().hashCode();
+        } catch (UnknownHostException e) {
+          LOG.info("Couldn't find IP for host: " + url.getHost());
+        }
+      }
+    }
+
+    // make hosts wind up in different partitions on different runs
+    hashCode ^= seed;
+
+    return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/package.html b/nutch-core/src/main/java/org/apache/nutch/crawl/package.html
new file mode 100644
index 0000000..05eeb50
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+Crawl control code and tools to run the crawler.
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItem.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItem.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItem.java
new file mode 100644
index 0000000..3ad4970
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItem.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.net.InetAddress;
+import java.net.URL;
+import java.net.UnknownHostException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.LoggerFactory;
+import org.slf4j.Logger;
+
+/**
+ * This class describes the item to be fetched.
+ */
+public class FetchItem {
+
+  private static final Logger LOG = LoggerFactory.getLogger(FetchItem.class);
+
+  // Depth at which this URL was discovered while following outlinks.
+  int outlinkDepth = 0;
+  // Identifier of the owning FetchItemQueue, of the form "proto://key".
+  String queueID;
+  // The URL both in its original Text form and as a parsed java.net.URL.
+  Text url;
+  URL u;
+  CrawlDatum datum;
+
+  public FetchItem(Text url, URL u, CrawlDatum datum, String queueID) {
+    this(url, u, datum, queueID, 0);
+  }
+
+  public FetchItem(Text url, URL u, CrawlDatum datum, String queueID,
+      int outlinkDepth) {
+    this.url = url;
+    this.u = u;
+    this.datum = datum;
+    this.queueID = queueID;
+    this.outlinkDepth = outlinkDepth;
+  }
+
+  /**
+   * Create an item. Queue id will be created based on <code>queueMode</code>
+   * argument, either as a protocol + hostname pair, protocol + IP address
+   * pair or protocol+domain pair.
+   */
+  public static FetchItem create(Text url, CrawlDatum datum, String queueMode) {
+    return create(url, datum, queueMode, 0);
+  }
+
+  /**
+   * Same as {@link #create(Text, CrawlDatum, String)} but also records the
+   * outlink depth. Returns <code>null</code> when the URL cannot be parsed
+   * or, in IP mode, when its host cannot be resolved.
+   */
+  public static FetchItem create(Text url, CrawlDatum datum,
+      String queueMode, int outlinkDepth) {
+    String queueID;
+    URL u = null;
+    try {
+      u = new URL(url.toString());
+    } catch (Exception e) {
+      LOG.warn("Cannot parse url: " + url, e);
+      return null;
+    }
+    final String proto = u.getProtocol().toLowerCase();
+    String key;
+    if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
+      try {
+        final InetAddress addr = InetAddress.getByName(u.getHost());
+        key = addr.getHostAddress();
+      } catch (final UnknownHostException e) {
+        // unable to resolve it, so don't fall back to host name
+        LOG.warn("Unable to resolve: " + u.getHost() + ", skipping.");
+        return null;
+      }
+    } else if (FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)) {
+      key = URLUtil.getDomainName(u);
+      if (key == null) {
+        LOG.warn("Unknown domain for url: " + url
+            + ", using URL string as key");
+        key = u.toExternalForm();
+      }
+    } else {
+      // default: queue by host
+      key = u.getHost();
+      if (key == null) {
+        LOG.warn("Unknown host for url: " + url + ", using URL string as key");
+        key = u.toExternalForm();
+      }
+    }
+    // lower-case so the same host/domain/IP always maps to one queue
+    queueID = proto + "://" + key.toLowerCase();
+    return new FetchItem(url, u, datum, queueID, outlinkDepth);
+  }
+
+  public CrawlDatum getDatum() {
+    return datum;
+  }
+
+  public String getQueueID() {
+    return queueID;
+  }
+
+  public Text getUrl() {
+    return url;
+  }
+
+  public URL getURL2() {
+    return u;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItemQueue.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItemQueue.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItemQueue.java
new file mode 100644
index 0000000..182c063
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItemQueue.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class handles FetchItems which come from the same host ID (be it a
+ * proto/hostname or proto/IP pair). It also keeps track of requests in
+ * progress and elapsed time between requests.
+ */
+public class FetchItemQueue {
+
+  // Logger bound to this class; the original bound it to
+  // FetchItemQueues.class, apparently a copy-paste slip.
+  private static final Logger LOG = LoggerFactory.getLogger(FetchItemQueue.class);
+
+  // Items waiting to be fetched, in arrival order.
+  List<FetchItem> queue = Collections
+      .synchronizedList(new LinkedList<FetchItem>());
+  // Number of items currently being fetched by worker threads.
+  AtomicInteger inProgress = new AtomicInteger();
+  // Earliest wall-clock time (ms) at which the next fetch may start.
+  AtomicLong nextFetchTime = new AtomicLong();
+  // Count of fetch exceptions seen for this queue.
+  AtomicInteger exceptionCounter = new AtomicInteger();
+  long crawlDelay;
+  long minCrawlDelay;
+  int maxThreads;
+  Configuration conf;
+
+  public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay,
+      long minCrawlDelay) {
+    this.conf = conf;
+    this.maxThreads = maxThreads;
+    this.crawlDelay = crawlDelay;
+    this.minCrawlDelay = minCrawlDelay;
+    // ready to start
+    setEndTime(System.currentTimeMillis() - crawlDelay);
+  }
+
+  /** Drops all queued items and returns how many were dropped. */
+  public synchronized int emptyQueue() {
+    int presize = queue.size();
+    queue.clear();
+    return presize;
+  }
+
+  public int getQueueSize() {
+    return queue.size();
+  }
+
+  public int getInProgressSize() {
+    return inProgress.get();
+  }
+
+  public int incrementExceptionCounter() {
+    return exceptionCounter.incrementAndGet();
+  }
+
+  /**
+   * Marks an in-progress item as done and schedules the next allowed fetch
+   * time; with <code>asap</code> the next fetch may start immediately.
+   */
+  public void finishFetchItem(FetchItem it, boolean asap) {
+    if (it != null) {
+      inProgress.decrementAndGet();
+      setEndTime(System.currentTimeMillis(), asap);
+    }
+  }
+
+  public void addFetchItem(FetchItem it) {
+    if (it == null)
+      return;
+    queue.add(it);
+  }
+
+  /**
+   * Accounts for an item that is already being fetched: only the
+   * in-progress counter is bumped, the item is not queued.
+   */
+  public void addInProgressFetchItem(FetchItem it) {
+    if (it == null)
+      return;
+    inProgress.incrementAndGet();
+  }
+
+  /**
+   * Returns the next item to fetch, or null when the queue is empty, all
+   * allowed threads are busy, or the crawl delay has not yet elapsed.
+   */
+  public FetchItem getFetchItem() {
+    if (inProgress.get() >= maxThreads)
+      return null;
+    long now = System.currentTimeMillis();
+    if (nextFetchTime.get() > now)
+      return null;
+    FetchItem it = null;
+    if (queue.size() == 0)
+      return null;
+    try {
+      it = queue.remove(0);
+      inProgress.incrementAndGet();
+    } catch (Exception e) {
+      LOG.error(
+          "Cannot remove FetchItem from queue or cannot add it to inProgress queue",
+          e);
+    }
+    return it;
+  }
+
+  /** Logs the queue's configuration, timing state and pending URLs. */
+  public synchronized void dump() {
+    LOG.info("  maxThreads    = " + maxThreads);
+    LOG.info("  inProgress    = " + inProgress.get());
+    LOG.info("  crawlDelay    = " + crawlDelay);
+    LOG.info("  minCrawlDelay = " + minCrawlDelay);
+    LOG.info("  nextFetchTime = " + nextFetchTime.get());
+    LOG.info("  now           = " + System.currentTimeMillis());
+    for (int i = 0; i < queue.size(); i++) {
+      FetchItem it = queue.get(i);
+      LOG.info("  " + i + ". " + it.url);
+    }
+  }
+
+  private void setEndTime(long endTime) {
+    setEndTime(endTime, false);
+  }
+
+  // With multiple threads per queue the (shorter) minCrawlDelay applies;
+  // asap skips the delay entirely.
+  private void setEndTime(long endTime, boolean asap) {
+    if (!asap)
+      nextFetchTime.set(endTime
+          + (maxThreads > 1 ? minCrawlDelay : crawlDelay));
+    else
+      nextFetchTime.set(endTime);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItemQueues.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItemQueues.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItemQueues.java
new file mode 100644
index 0000000..4473ff0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Convenience class - a collection of queues that keeps track of the total
+ * number of items, and provides items eligible for fetching from any queue.
+ */
+public class FetchItemQueues {
+
+  private static final Logger LOG = LoggerFactory.getLogger(FetchItemQueues.class);
+  
+  public static final String DEFAULT_ID = "default";
+  // One FetchItemQueue per queue id ("proto://key").
+  Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
+  // Total number of queued items across all queues.
+  AtomicInteger totalSize = new AtomicInteger(0);
+  int maxThreads;
+  long crawlDelay;
+  long minCrawlDelay;
+  // Absolute deadline (ms since epoch); -1 means no time limit.
+  long timelimit = -1;
+  // Exceptions tolerated per queue before it is purged; -1 disables.
+  int maxExceptionsPerQueue = -1;
+  Configuration conf;
+
+  public static final String QUEUE_MODE_HOST = "byHost";
+  public static final String QUEUE_MODE_DOMAIN = "byDomain";
+  public static final String QUEUE_MODE_IP = "byIP";
+
+  String queueMode;
+
+  /**
+   * Reads queue mode, per-queue thread count, crawl delays, time limit and
+   * exception threshold from the configuration; unknown modes fall back to
+   * {@link #QUEUE_MODE_HOST}.
+   */
+  public FetchItemQueues(Configuration conf) {
+    this.conf = conf;
+    this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
+    queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
+    // check that the mode is known
+    if (!queueMode.equals(QUEUE_MODE_IP)
+        && !queueMode.equals(QUEUE_MODE_DOMAIN)
+        && !queueMode.equals(QUEUE_MODE_HOST)) {
+      LOG.error("Unknown partition mode : " + queueMode
+          + " - forcing to byHost");
+      queueMode = QUEUE_MODE_HOST;
+    }
+    LOG.info("Using queue mode : " + queueMode);
+
+    // delays are configured in seconds (float) and stored in milliseconds
+    this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
+    this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay",
+        0.0f) * 1000);
+    this.timelimit = conf.getLong("fetcher.timelimit", -1);
+    this.maxExceptionsPerQueue = conf.getInt(
+        "fetcher.max.exceptions.per.queue", -1);
+  }
+
+  public int getTotalSize() {
+    return totalSize.get();
+  }
+
+  public int getQueueCount() {
+    return queues.size();
+  }
+
+  /** Wraps url/datum into a FetchItem and enqueues it (no-op if unparsable). */
+  public void addFetchItem(Text url, CrawlDatum datum) {
+    FetchItem it = FetchItem.create(url, datum, queueMode);
+    if (it != null)
+      addFetchItem(it);
+  }
+
+  public synchronized void addFetchItem(FetchItem it) {
+    FetchItemQueue fiq = getFetchItemQueue(it.queueID);
+    fiq.addFetchItem(it);
+    totalSize.incrementAndGet();
+  }
+
+  public void finishFetchItem(FetchItem it) {
+    finishFetchItem(it, false);
+  }
+
+  /** Marks the item done in its queue; ignored if the queue is gone. */
+  public void finishFetchItem(FetchItem it, boolean asap) {
+    FetchItemQueue fiq = queues.get(it.queueID);
+    if (fiq == null) {
+      LOG.warn("Attempting to finish item from unknown queue: " + it);
+      return;
+    }
+    fiq.finishFetchItem(it, asap);
+  }
+
+  /** Returns the queue for the given id, creating it lazily. */
+  public synchronized FetchItemQueue getFetchItemQueue(String id) {
+    FetchItemQueue fiq = queues.get(id);
+    if (fiq == null) {
+      // initialize queue
+      fiq = new FetchItemQueue(conf, maxThreads, crawlDelay, minCrawlDelay);
+      queues.put(id, fiq);
+    }
+    return fiq;
+  }
+
+  /**
+   * Returns the first eligible item from any queue, removing empty idle
+   * queues along the way; null when nothing is eligible right now.
+   */
+  public synchronized FetchItem getFetchItem() {
+    Iterator<Map.Entry<String, FetchItemQueue>> it = queues.entrySet()
+        .iterator();
+    while (it.hasNext()) {
+      FetchItemQueue fiq = it.next().getValue();
+      // reap empty queues
+      if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) {
+        it.remove();
+        continue;
+      }
+      FetchItem fit = fiq.getFetchItem();
+      if (fit != null) {
+        totalSize.decrementAndGet();
+        return fit;
+      }
+    }
+    return null;
+  }
+
+  // called only once the feeder has stopped
+  public synchronized int checkTimelimit() {
+    int count = 0;
+
+    if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
+      // emptying the queues
+      count = emptyQueues();
+
+      // there might also be a case where totalsize !=0 but number of queues
+      // == 0
+      // in which case we simply force it to 0 to avoid blocking
+      if (totalSize.get() != 0 && queues.size() == 0)
+        totalSize.set(0);
+    }
+    return count;
+  }
+
+  // empties the queues (used by timebomb and throughput threshold)
+  public synchronized int emptyQueues() {
+    int count = 0;
+
+    for (String id : queues.keySet()) {
+      FetchItemQueue fiq = queues.get(id);
+      if (fiq.getQueueSize() == 0)
+        continue;
+      LOG.info("* queue: " + id + " >> dropping! ");
+      int deleted = fiq.emptyQueue();
+      // keep the global counter in sync with the per-queue drop
+      for (int i = 0; i < deleted; i++) {
+        totalSize.decrementAndGet();
+      }
+      count += deleted;
+    }
+
+    return count;
+  }
+
+  /**
+   * Increment the exception counter of a queue in case of an exception e.g.
+   * timeout; when higher than a given threshold simply empty the queue.
+   * 
+   * @param queueid
+   * @return number of purged items
+   */
+  public synchronized int checkExceptionThreshold(String queueid) {
+    FetchItemQueue fiq = queues.get(queueid);
+    if (fiq == null) {
+      return 0;
+    }
+    if (fiq.getQueueSize() == 0) {
+      return 0;
+    }
+    int excCount = fiq.incrementExceptionCounter();
+    if (maxExceptionsPerQueue != -1 && excCount >= maxExceptionsPerQueue) {
+      // too many exceptions for items in this queue - purge it
+      int deleted = fiq.emptyQueue();
+      LOG.info("* queue: " + queueid + " >> removed " + deleted
+          + " URLs from queue because " + excCount + " exceptions occurred");
+      for (int i = 0; i < deleted; i++) {
+        totalSize.decrementAndGet();
+      }
+      return deleted;
+    }
+    return 0;
+  }
+
+  /** Logs the state of every non-empty queue. */
+  public synchronized void dump() {
+    for (String id : queues.keySet()) {
+      FetchItemQueue fiq = queues.get(id);
+      if (fiq.getQueueSize() == 0)
+        continue;
+      LOG.info("* queue: " + id);
+      fiq.dump();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchNode.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchNode.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchNode.java
new file mode 100644
index 0000000..892c90f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchNode.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.parse.Outlink;
+
+/**
+ * Value object capturing the outcome of fetching a single page: the URL,
+ * the page title, the outlinks that were discovered, the protocol status
+ * and the fetch timestamp.
+ */
+public class FetchNode {
+
+  private Text fetchedUrl = null;
+  private Outlink[] discoveredLinks;
+  private int protocolStatus = 0;
+  private String pageTitle = null;
+  private long fetchTimestamp = 0;
+
+  public Text getUrl() {
+    return this.fetchedUrl;
+  }
+
+  public void setUrl(Text url) {
+    this.fetchedUrl = url;
+  }
+
+  public Outlink[] getOutlinks() {
+    return this.discoveredLinks;
+  }
+
+  public void setOutlinks(Outlink[] links) {
+    this.discoveredLinks = links;
+  }
+
+  public int getStatus() {
+    return this.protocolStatus;
+  }
+
+  public void setStatus(int status) {
+    this.protocolStatus = status;
+  }
+
+  public String getTitle() {
+    return this.pageTitle;
+  }
+
+  public void setTitle(String title) {
+    this.pageTitle = title;
+  }
+
+  public long getFetchTime() {
+    return this.fetchTimestamp;
+  }
+
+  public void setFetchTime(long fetchTime) {
+    this.fetchTimestamp = fetchTime;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchNodeDb.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchNodeDb.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchNodeDb.java
new file mode 100644
index 0000000..2e69f31
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetchNodeDb.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+
+/**
+ * Process-wide registry of {@link FetchNode}s, keyed by the 1-based order
+ * in which they were added. Obtained through a lazily created singleton.
+ */
+public class FetchNodeDb {
+
+  private final Map<Integer, FetchNode> fetchNodeDbMap;
+  private int index;
+  private static FetchNodeDb fetchNodeDbInstance = null;
+
+  public FetchNodeDb() {
+    fetchNodeDbMap = new ConcurrentHashMap<Integer, FetchNode>();
+    index = 1;
+  }
+
+  /**
+   * Returns the singleton instance, creating it on first use. Synchronized
+   * because the original check-then-act initialization could create two
+   * instances under concurrent first calls.
+   */
+  public static synchronized FetchNodeDb getInstance() {
+    if (fetchNodeDbInstance == null) {
+      fetchNodeDbInstance = new FetchNodeDb();
+    }
+    return fetchNodeDbInstance;
+  }
+
+  /**
+   * Stores a node under the next sequential index. Synchronized because
+   * "index++" is not atomic. (Leftover System.out debug print removed.)
+   *
+   * @param url ignored; kept for backward compatibility with callers
+   * @param fetchNode the node to record
+   */
+  public synchronized void put(String url, FetchNode fetchNode) {
+    fetchNodeDbMap.put(index++, fetchNode);
+  }
+
+  public Map<Integer, FetchNode> getFetchNodeDb() {
+    return fetchNodeDbMap;
+  }
+}
\ No newline at end of file


[10/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/nutch.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/nutch.html b/nutch-plugins/parse-tika/src/test/resources/nutch.html
new file mode 100644
index 0000000..0aa7c98
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/nutch.html
@@ -0,0 +1,519 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.8">
+<meta name="Forrest-skin-name" content="lucene">
+<title>Welcome to Nutch!</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://lucene.apache.org/">Lucene</a> &gt; <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://search.lucidimagination.com/p:nutch" method="get" class="roundtopsmall">
+<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a>
+</div>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/nutch/">Wiki</a>
+</li>
+<li>
+<a class="unselected" href="http://issues.apache.org/jira/browse/Nutch">Jira</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Project</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
+<div class="menupage">
+<div class="menupagetitle">News</div>
+</div>
+<div class="menuitem">
+<a href="about.html">About</a>
+</div>
+<div class="menuitem">
+<a href="credits.html">Credits</a>
+</div>
+<div class="menuitem">
+<a href="http://www.cafepress.com/nutch/">Buy Stuff</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://wiki.apache.org/nutch/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/nutch/">Wiki</a>
+</div>
+<div class="menuitem">
+<a href="tutorial.html">Tutorial (0.7.2)</a>
+</div>
+<div class="menuitem">
+<a href="tutorial8.html">Tutorial (0.8.x)</a>
+</div>
+<div class="menuitem">
+<a href="bot.html">Robot     </a>
+</div>
+<div class="menuitem">
+<a href="i18n.html">i18n</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-1.0/index.html">API Docs (1.0)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-0.9/index.html">API Docs (0.9)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-0.8.x/index.html">API Docs (0.8.x)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs/index.html">API Docs (0.7.2)</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html">API Docs (nightly)</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="release/">Download</a>
+</div>
+<div class="menuitem">
+<a href="nightly.html">Nightly builds</a>
+</div>
+<div class="menuitem">
+<a href="mailing_lists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="issue_tracking.html">Issue Tracking</a>
+</div>
+<div class="menuitem">
+<a href="version_control.html">Version Control</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Lucene Java</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/hadoop/">Hadoop</a>
+</div>
+<div class="menuitem">
+<a href="http://incubator.apache.org/solr/">Solr</a>
+</div>
+</div>
+<div id="credit">
+<hr>
+<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
+</div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Welcome to Nutch!</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#News">News</a>
+<ul class="minitoc">
+<li>
+<a href="#14+August+2009+-+Lucene+at+US+ApacheCon">14 August 2009 - Lucene at US ApacheCon</a>
+</li>
+<li>
+<a href="#23+March+2009+-+Apache+Nutch+1.0+Released">23 March 2009 - Apache Nutch 1.0 Released</a>
+</li>
+<li>
+<a href="#09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam">09 February 2009 - Lucene at ApacheCon Europe 2009 in
+			Amsterdam</a>
+</li>
+<li>
+<a href="#2+April+2007%3A+Nutch+0.9+Released">2 April 2007: Nutch 0.9 Released</a>
+</li>
+<li>
+<a href="#24+September+2006%3A+Nutch+0.8.1+Released">24 September 2006: Nutch 0.8.1 Released</a>
+</li>
+<li>
+<a href="#25+July+2006%3A+Nutch+0.8+Released">25 July 2006: Nutch 0.8 Released</a>
+</li>
+<li>
+<a href="#31+March+2006%3A+Nutch+0.7.2+Released">31 March 2006: Nutch 0.7.2 Released</a>
+</li>
+<li>
+<a href="#1+October+2005%3A+Nutch+0.7.1+Released">1 October 2005: Nutch 0.7.1 Released</a>
+</li>
+<li>
+<a href="#17+August+2005%3A+Nutch+0.7+Released">17 August 2005: Nutch 0.7 Released</a>
+</li>
+<li>
+<a href="#June+2005%3A+Nutch+graduates+from+Incubator">June 2005: Nutch graduates from Incubator</a>
+</li>
+<li>
+<a href="#January+2005%3A+Nutch+Joins+Apache+Incubator">January 2005: Nutch Joins Apache Incubator</a>
+</li>
+<li>
+<a href="#September+2004%3A+Creative+Commons+launches+Nutch-based+Search">September 2004: Creative Commons launches Nutch-based Search</a>
+</li>
+<li>
+<a href="#September+2004%3A+Oregon+State+University+switches+to+Nutch">September 2004: Oregon State University switches to Nutch</a>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+
+    
+<a name="N1000D"></a><a name="News"></a>
+<h2 class="h3">News</h2>
+<div class="section">
+<a name="N10013"></a><a name="14+August+2009+-+Lucene+at+US+ApacheCon"></a>
+<h3 class="h4">14 August 2009 - Lucene at US ApacheCon</h3>
+<p>
+        
+<a href="http://www.us.apachecon.com/c/acus2009/" title="ApacheCon US 2009">
+            <img alt="ApacheCon Logo" class="float-right" src="http://www.apache.org/events/current-event-125x125.png">
+        </a>
+        ApacheCon US is once again in the Bay Area and Lucene is coming
+        along for the ride! The Lucene community has planned two full
+        days of talks, plus a meetup and the usual bevy of training.
+        With a well-balanced mix of first time and veteran ApacheCon
+        speakers, the
+        <a href="http://www.us.apachecon.com/c/acus2009/schedule#lucene">Lucene track</a>
+        at ApacheCon US promises to have something for everyone. Be sure
+        not to miss:
+    </p>
+<p> Training:</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/437">Lucene Boot Camp</a>
+            - A two day training session, Nov. 2nd &amp; 3rd
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/375">Solr Day</a>
+            - A one day training session, Nov. 2nd
+        </li>
+    
+</ul>
+<p>Thursday, Nov. 5th</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/428">Introduction to the Lucene Ecosystem
+            </a>
+            - Grant Ingersoll @ 9:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/461">Lucene Basics and New Features</a>
+            - Michael Busch @ 10:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/331">Apache Solr: Out of the Box</a>
+            - Chris Hostetter @ 14:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/427">Introduction to Nutch</a>
+            - Andrzej Bialecki @ 15:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/430">Lucene and Solr Performance Tuning</a>
+            - Mark Miller @ 16:30
+        </li>
+    
+</ul>
+<p>Friday, Nov. 6th</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/332">Implementing an Information Retrieval
+                Framework for an Organizational Repository</a>
+            - Sithu D Sudarsan @ 9:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/333">Apache Mahout - Going from raw data to
+                Information</a>
+            - Isabel Drost @ 10:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/334">MIME Magic with Apache Tika</a>
+            - Jukka Zitting @ 11:30
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/335">Building Intelligent Search Applications
+                with the Lucene Ecosystem</a>
+            - Ted Dunning @ 14:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/462">Realtime Search</a>
+            - Jason Rutherglen @ 15:00
+        </li>
+    
+</ul>
+<a name="N10091"></a><a name="23+March+2009+-+Apache+Nutch+1.0+Released"></a>
+<h3 class="h4">23 March 2009 - Apache Nutch 1.0 Released</h3>
+<p>The 1.0 release of Nutch is now available. This release includes several major feature improvements
+      such as new indexing framework, new scoring framework, Apache Solr integration just to mention a few.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-1.0.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N100A3"></a><a name="09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam"></a>
+<h3 class="h4">09 February 2009 - Lucene at ApacheCon Europe 2009 in
+			Amsterdam</h3>
+<p>
+			
+<a href="http://www.eu.apachecon.com/c/aceu2009/" title="ApacheCon EU 2009">
+				<img alt="ApacheCon EU 2009 Logo" class="float-right" src="http://www.eu.apachecon.com/page_attachments/0000/0115/125x125_basic.gif">
+			</a>
+
+			Lucene will be extremely well represented at
+			<a href="http://www.eu.apachecon.com/c/aceu2009/">ApacheCon EU 2009</a>
+			in Amsterdam, Netherlands this March 23-27, 2009:
+		</p>
+<ul>
+			
+<li>
+				
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/197">Lucene Boot Camp</a>
+				- A two day training session, March 23 &amp; 24th</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/201">Solr Boot Camp</a> - A one day training session, March 24th</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/136">Introducing Apache Mahout</a> - Grant Ingersoll. March 25th @ 10:30</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/137">Lucene/Solr Case Studies</a> - Erik Hatcher. March 25th @ 11:30</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/138">Advanced Indexing Techniques with Apache Lucene</a> - Michael Busch. March 25th @ 14:00</li>  
+                   
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/251">Apache Solr - A Case Study</a> - Uri Boness. March 26th @ 17:30</li>
+           
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/250">Best of breed - httpd, forrest, solr and droids</a> - Thorsten Scherler. March 27th @ 17:30</li>
+           
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/165">Apache Droids - an intelligent standalone robot framework</a> - Thorsten Scherler. March 26th @ 15:00</li>
+
+               
+</ul>
+<a name="N100EF"></a><a name="2+April+2007%3A+Nutch+0.9+Released"></a>
+<h3 class="h4">2 April 2007: Nutch 0.9 Released</h3>
+<p>The 0.9 release of Nutch is now available. This is the second release of Nutch
+      based entirely on the underlying Hadoop platform. This release includes several critical
+      bug fixes, as well as key speedups described in more detail at 
+      <a href="http://blog.foofactory.fi/2007/03/twice-speed-half-size.html">Sami Siren's blog</a>.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.9.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10105"></a><a name="24+September+2006%3A+Nutch+0.8.1+Released"></a>
+<h3 class="h4">24 September 2006: Nutch 0.8.1 Released</h3>
+<p>The 0.8.1 release of Nutch is now available. This is a maintenance release to 0.8 branch fixing many serous bugs found in version 0.8.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.8.1.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10117"></a><a name="25+July+2006%3A+Nutch+0.8+Released"></a>
+<h3 class="h4">25 July 2006: Nutch 0.8 Released</h3>
+<p>The 0.8 release of Nutch is now available. This is the first release of Nutch based on
+      hadoop architecure. See <a href="http://svn.apache.org/viewvc/lucene/nutch/tags/release-0.8/CHANGES.txt?view=markup">
+      CHANGES.txt</a> for list of changes made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10129"></a><a name="31+March+2006%3A+Nutch+0.7.2+Released"></a>
+<h3 class="h4">31 March 2006: Nutch 0.7.2 Released</h3>
+<p>The 0.7.2 release of Nutch is now available. This is a bug fix release for 0.7 branch. See
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=390158">
+      CHANGES.txt</a> for details. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1013B"></a><a name="1+October+2005%3A+Nutch+0.7.1+Released"></a>
+<h3 class="h4">1 October 2005: Nutch 0.7.1 Released</h3>
+<p>The 0.7.1 release of Nutch is now available. This is a bug fix release. See
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986">
+      CHANGES.txt</a> for details. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1014D"></a><a name="17+August+2005%3A+Nutch+0.7+Released"></a>
+<h3 class="h4">17 August 2005: Nutch 0.7 Released</h3>
+<p>This is the first Nutch release as an Apache Lucene sub-project. See 
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/CHANGES.txt?rev=233150">
+      CHANGES.txt</a> for details. The release is available 
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1015F"></a><a name="June+2005%3A+Nutch+graduates+from+Incubator"></a>
+<h3 class="h4">June 2005: Nutch graduates from Incubator</h3>
+<p>Nutch has now graduated from the Apache incubator, and is now
+      a Subproject of Lucene.</p>
+<a name="N10169"></a><a name="January+2005%3A+Nutch+Joins+Apache+Incubator"></a>
+<h3 class="h4">January 2005: Nutch Joins Apache Incubator</h3>
+<p>Nutch is a two-year-old open source project, previously
+        hosted at Sourceforge and backed by its own non-profit
+        organization. The non-profit was founded in order to assign
+        copyright, so that we could retain the right to change the
+        license. We have now determined that the Apache license is the
+        appropriate license for Nutch and no longer require the
+        overhead of an independent non-profit organization. Nutch's
+        board of directors and its developers were both polled and
+        supported the move to the Apache foundation.</p>
+<a name="N10173"></a><a name="September+2004%3A+Creative+Commons+launches+Nutch-based+Search"></a>
+<h3 class="h4">September 2004: Creative Commons launches Nutch-based Search</h3>
+<p>Creative Commons unveiled a beta version of its search
+      engine, which scours the web for text, images, audio, and video
+      free to re-use on certain terms a search refinement offered by
+      no other company or organization.</p>
+<p>See the <a href="http://creativecommons.org/press-releases/entry/5064">Creative
+      Commons Press Release</a> for more details.</p>
+<a name="N10184"></a><a name="September+2004%3A+Oregon+State+University+switches+to+Nutch"></a>
+<h3 class="h4">September 2004: Oregon State University switches to Nutch</h3>
+<p>Oregon State University is converting its searching
+      infrastructure from Googletm to the open source project
+      Nutch. The effort to replace the Googletm will realize
+      significant cost savings for Oregon State University, while
+      promoting both the Nutch Search Engine and transparency in
+      search engine use and management.</p>
+<p>For more details see the announcement by OSU's <a href="http://osuosl.org/news_folder/nutch">Open Source
+      Lab</a>.</p>
+</div>
+
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2006 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<div id="logos"></div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif b/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif
new file mode 100644
index 0000000..0545a60
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/ootest.odt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.odt b/nutch-plugins/parse-tika/src/test/resources/ootest.odt
new file mode 100644
index 0000000..e36e389
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/ootest.odt differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/ootest.sxw
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.sxw b/nutch-plugins/parse-tika/src/test/resources/ootest.sxw
new file mode 100644
index 0000000..260b1c2
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/ootest.sxw differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/ootest.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.txt b/nutch-plugins/parse-tika/src/test/resources/ootest.txt
new file mode 100644
index 0000000..685f89a
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/ootest.txt
@@ -0,0 +1,30 @@
+\ufeffAbcedfg				?????
+Abcdefg
+Abcdefg
+abcdefg
+
+
+
+
+
+
+
+
+
+
+ http://www.openoffice.org
+
+Title
+Col1
+Col2
+Col3
+head
+Cell1
+Cell2
+Cel3
+total
+TOTAL
+TOTAL
+TOTAL
+
+Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Integer a leo in lacus malesuada ornare. Mauris sagittis. Nam vestibulum. Nunc gravida vestibulum augue. Praesent sed lectus quis lectus adipiscing bibendum. Sed nulla. Duis posuere justo eget urna. Proin lorem orci, vestibulum ut, consequat molestie, eleifend a, nibh. Mauris sed lacus. Etiam blandit tincidunt neque. Cras ac sapien. Duis erat. 

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf b/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf
new file mode 100644
index 0000000..e7c6e62
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf
@@ -0,0 +1,157 @@
+%PDF-1.2 
+%\ufffd\ufffd\ufffd\ufffd
+ 
+9 0 obj
+<<
+/Length 10 0 R
+/Filter /FlateDecode 
+>>
+stream
+H\ufffd\u0350\ufffdJ\ufffd0\ufffd\ufffd \ufffd\ufffd{\ufffd\ufffdf\ufffd$M\ufffd\ufffdn\ufffd-\ufffd\ufffd\ufffd[&je\ufffd\ufffd\ufffd\u06e4\ufffd~\ufffd$\ufffd\ufffd\ufffd}\ufffd\u0245\ufffdIj\ufffd\ufffd\ufffds\ufffd\ufffd\ufffd\ufffd~\ufffdX\ufffd-],\ufffd\ufffd$Y\ufffd\ufffd\ufffd)\ufffd'N\ufffdu\ufffd1!\ufffd\ufffd\ufffdV\ufffd?\ufffd\ufffd?
+\ufffdb1Rbb\ufffd\u0489\ufffdH\ufffd[\ufffd\ufffdTD:#\ufffd&\u062d\ufffd\ufffdX\ufffd\ufffd\ufffdi\ufffd$qnf\ufffd\ufffd\ufffd\ufffd\ufffd]\ufffd\ufffd\ufffd\ufffd\ufffd\ufffda\ufffd\ufffd{\ufffd\ufffd\u0623\ufffd\ufffd\ufffdq|J\ufffdLs]\ufffdQ\ufffdI\ufffd\ufffdj\ufffd%\ufffd\ufffd9\ufffd\ufffd`\ufffd\u09ba\ufffd\ufffdU\ufffdite\ufffdz\ufffd$\ufffd\ufffd\ufffd\ufffdOeB\ufffd\u0112\u04af\ufffdR\ufffd\ufffd@z\u0717\ufffd\ufffd\ufffdg\ufffd\ufffd\ufffd<\ufffd\ufffd\ufffd
+endstream
+endobj
+10 0 obj
+246
+endobj
+4 0 obj
+<<
+/Type /Page
+/Parent 5 0 R
+/Resources <<
+/Font <<
+/F0 6 0 R 
+/F1 7 0 R 
+>>
+/ProcSet 2 0 R
+>>
+/Contents 9 0 R
+>>
+endobj
+6 0 obj
+<<
+/Type /Font
+/Subtype /TrueType
+/Name /F0
+/BaseFont /Arial
+/Encoding /WinAnsiEncoding
+>>
+endobj
+7 0 obj
+<<
+/Type /Font
+/Subtype /TrueType
+/Name /F1
+/BaseFont /BookAntiqua,Bold
+/FirstChar 31
+/LastChar 255
+/Widths [ 750 250 278 402 606 500 889 833 227 333 333 444 606 250 333 250 
+296 500 500 500 500 500 500 500 500 500 500 250 250 606 606 606 
+444 747 778 667 722 833 611 556 833 833 389 389 778 611 1000 833 
+833 611 833 722 611 667 778 778 1000 667 667 667 333 606 333 606 
+500 333 500 611 444 611 500 389 556 611 333 333 611 333 889 611 
+556 611 611 389 444 333 611 556 833 500 556 500 310 606 310 606 
+750 500 750 333 500 500 1000 500 500 333 1000 611 389 1000 750 750 
+750 750 278 278 500 500 606 500 1000 333 998 444 389 833 750 750 
+667 250 278 500 500 606 500 606 500 333 747 438 500 606 333 747 
+500 400 549 361 361 333 576 641 250 333 361 488 500 889 890 889 
+444 778 778 778 778 778 778 1000 722 611 611 611 611 389 389 389 
+389 833 833 833 833 833 833 833 606 833 778 778 778 778 667 611 
+611 500 500 500 500 500 500 778 444 500 500 500 500 333 333 333 
+333 556 611 556 556 556 556 556 549 556 611 611 611 611 556 611 
+556 ]
+/Encoding /WinAnsiEncoding
+/FontDescriptor 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Type /FontDescriptor
+/FontName /BookAntiqua,Bold
+/Flags 16418
+/FontBBox [ -250 -260 1236 930 ]
+/MissingWidth 750
+/StemV 146
+/StemH 146
+/ItalicAngle 0
+/CapHeight 930
+/XHeight 651
+/Ascent 930
+/Descent 260
+/Leading 210
+/MaxWidth 1030
+/AvgWidth 460
+>>
+endobj
+2 0 obj
+[ /PDF /Text  ]
+endobj
+5 0 obj
+<<
+/Kids [4 0 R ]
+/Count 1
+/Type /Pages
+/MediaBox [ 0 0 612 792 ]
+>>
+endobj
+1 0 obj
+<<
+/Creator (1725.fm)
+/CreationDate (1-Jan-3 18:15PM)
+/Title (1725.PDF)
+/Author (Unknown)
+/Producer (Acrobat PDFWriter 3.02 for Windows)
+/Keywords ()
+/Subject ()
+>>
+endobj
+3 0 obj
+<<
+/Pages 5 0 R
+/Type /Catalog
+/DefaultGray 11 0 R
+/DefaultRGB  12 0 R
+>>
+endobj
+11 0 obj
+[/CalGray
+<<
+/WhitePoint [0.9505 1 1.0891 ]
+/Gamma 0.2468 
+>>
+]
+endobj
+12 0 obj
+[/CalRGB
+<<
+/WhitePoint [0.9505 1 1.0891 ]
+/Gamma [0.2468 0.2468 0.2468 ]
+/Matrix [0.4361 0.2225 0.0139 0.3851 0.7169 0.0971 0.1431 0.0606 0.7141 ]
+>>
+]
+endobj
+xref
+0 13
+0000000000 65535 f
+0000002172 00000 n
+0000002046 00000 n
+0000002363 00000 n
+0000000375 00000 n
+0000002080 00000 n
+0000000518 00000 n
+0000000633 00000 n
+0000001760 00000 n
+0000000021 00000 n
+0000000352 00000 n
+0000002460 00000 n
+0000002548 00000 n
+trailer
+<<
+/Size 13
+/Root 3 0 R
+/Info 1 0 R
+/ID [<47149510433dd4882f05f8c124223734><47149510433dd4882f05f8c124223734>]
+>>
+startxref
+2726
+%%EOF

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/rsstest.rss
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/rsstest.rss b/nutch-plugins/parse-tika/src/test/resources/rsstest.rss
new file mode 100644
index 0000000..6c4ae48
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/rsstest.rss
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+<rss version="0.91">
+    <channel>
+      <title>TestChannel</title>
+      <link>http://test.channel.com/</link> 
+      <description>Sample RSS File for Junit test</description> 
+      <language>en-us</language>
+      
+      <item>
+        <title>Home Page of Chris Mattmann</title>
+        <link>http://www-scf.usc.edu/~mattmann/</link>
+        <description>Chris Mattmann's home page</description>
+      </item>
+
+      <item>
+        <title>Awesome Open Source Search Engine</title> 
+        <link>http://www.nutch.org/</link> 
+        <description>Yup, that's what it is</description> 
+      </item>
+   </channel>
+</rss>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/test.rtf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/test.rtf b/nutch-plugins/parse-tika/src/test/resources/test.rtf
new file mode 100644
index 0000000..c67a6c8
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/test.rtf
@@ -0,0 +1,17 @@
+{\rtf1\ansi\deff1\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Times;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\fmodern\fprq1\fcharset0 Courier New;}{\f3\froman\fprq2\fcharset0 Times New Roman;}{\f4\fnil\fprq2\fcharset0 Interface User;}{\f5\fnil\fprq2\fcharset0 Lucidasans;}{\f6\fnil\fprq0\fcharset0 Lucidasans;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue128;\red128\green128\blue128;}
+{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033\snext1 Default;}
+{\s2\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext2 Text body;}
+{\s3\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af1\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon2\snext3 List;}
+{\s4\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs20\lang255\ai\ltrch\dbch\afs20\langfe255\ai\loch\f1\fs20\lang1033\i\sbasedon1\snext4 Caption;}
+{\s5\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext5 Index;}
+{\*\cs7\cf0\rtlch\af2\afs24\lang255\ltrch\dbch\af2\afs24\langfe255\loch\f2\fs24\lang1033 Teletype;}
+{\*\cs8\cf2\ul\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\fs24\lang1033 Internet Link;}
+}
+{\info{\title test rft document}{\subject tests}{\creatim\yr2004\mo9\dy20\hr19\min36}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6450}}\deftab709
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\pgdscnxt0 Default;}}
+{\*\pgdscno0}\paperh16837\paperw11905\margl1800\margr1800\margt1440\margb1440\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
+\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\ql\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033{\loch\f2\fs24\lang1033\i0\b0\*\cs7\cf0\rtlch\ltrch\dbch\loch\f2\fs24\lang1033 The quick brown fox jumps over the lazy dog}
+\par }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/word97.doc
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/word97.doc b/nutch-plugins/parse-tika/src/test/resources/word97.doc
new file mode 100644
index 0000000..4d012da
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/word97.doc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/build.xml b/nutch-plugins/parse-zip/build.xml
new file mode 100644
index 0000000..991ce31
--- /dev/null
+++ b/nutch-plugins/parse-zip/build.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-zip" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+   <!-- <ant target="deploy" inheritall="false" dir="../parse-text"/>-->
+  </target>
+
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.zip" />
+    </fileset>
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/ivy.xml b/nutch-plugins/parse-zip/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-zip/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/plugin.xml b/nutch-plugins/parse-zip/plugin.xml
new file mode 100644
index 0000000..35ec0eb
--- /dev/null
+++ b/nutch-plugins/parse-zip/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-zip"
+   name="Zip Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-zip.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.zip"
+              name="ZipParser" 
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.zip.ZipParser" 
+                      class="org.apache.nutch.parse.zip.ZipParser">
+        <parameter name="contentType" value="application/zip"/>
+        <parameter name="pathSuffix"  value="zip"/>
+      </implementation>
+      
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/pom.xml b/nutch-plugins/parse-zip/pom.xml
new file mode 100644
index 0000000..b30b9a1
--- /dev/null
+++ b/nutch-plugins/parse-zip/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-zip</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-zip</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
new file mode 100644
index 0000000..f441fd0
--- /dev/null
+++ b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter.
+ * Nutch parse plugin for zip files - Content Type : application/zip
+ */
+public class ZipParser implements Parser {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ZipParser.class);
+  private Configuration conf;
+
+  /** Creates a new instance of ZipParser */
+  public ZipParser() {
+  }
+
+  public ParseResult getParse(final Content content) {
+
+    String resultText = null;
+    String resultTitle = null;
+    Outlink[] outlinks = null;
+    List<Outlink> outLinksList = new ArrayList<Outlink>();
+
+    try {
+      final String contentLen = content.getMetadata().get(
+          Response.CONTENT_LENGTH);
+      final int len = Integer.parseInt(contentLen);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("ziplen: " + len);
+      }
+      final byte[] contentInBytes = content.getContent();
+
+      if (contentLen != null && contentInBytes.length != len) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at "
+                + contentInBytes.length
+                + " bytes. Parser can't handle incomplete zip file.")
+            .getEmptyParseResult(content.getUrl(), getConf());
+      }
+
+      ZipTextExtractor extractor = new ZipTextExtractor(getConf());
+
+      // extract text
+      resultText = extractor.extractText(new ByteArrayInputStream(
+          contentInBytes), content.getUrl(), outLinksList);
+
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as Zip document. " + e).getEmptyParseResult(
+          content.getUrl(), getConf());
+    }
+
+    if (resultText == null) {
+      resultText = "";
+    }
+
+    if (resultTitle == null) {
+      resultTitle = "";
+    }
+
+    outlinks = (Outlink[]) outLinksList.toArray(new Outlink[0]);
+    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+        resultTitle, outlinks, content.getMetadata());
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Zip file parsed sucessfully !!");
+    }
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(
+        resultText, parseData));
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public static void main(String[] args) throws IOException {
+    if (args.length < 1) {
+      System.out.println("ZipParser <zip_file>");
+      System.exit(1);
+    }
+    File file = new File(args[0]);
+    String url = "file:"+file.getCanonicalPath();
+    FileInputStream in = new FileInputStream(file);
+    byte[] bytes = new byte[in.available()];
+    in.read(bytes);
+    in.close();
+    Configuration conf = NutchConfiguration.create();
+    ZipParser parser = new ZipParser();
+    parser.setConf(conf);
+    Metadata meta = new Metadata();
+    meta.add(Response.CONTENT_LENGTH, ""+file.length());
+    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+        "application/zip", meta, conf));
+    Parse p = parseResult.get(url);
+    System.out.println(parseResult.size());
+    System.out.println("Parse Text:");
+    System.out.println(p.getText());
+    System.out.println("Parse Data:");
+    System.out.println(p.getData());
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
new file mode 100644
index 0000000..b454727
--- /dev/null
+++ b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+// JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import java.net.URL;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.protocol.Content;
+import org.apache.tika.Tika;
+
+/**
+ * 
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class ZipTextExtractor {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ZipTextExtractor.class);
+
+  private Configuration conf;
+
+  /** Creates a new instance of ZipTextExtractor */
+  public ZipTextExtractor(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public String extractText(InputStream input, String url,
+      List<Outlink> outLinksList) throws IOException {
+    String resultText = "";
+    ZipInputStream zin = new ZipInputStream(input);
+    ZipEntry entry;
+
+    while ((entry = zin.getNextEntry()) != null) {
+
+      if (!entry.isDirectory()) {
+        int size = (int) entry.getSize();
+        byte[] b = new byte[size];
+        for (int x = 0; x < size; x++) {
+          int err = zin.read();
+          if (err != -1) {
+            b[x] = (byte) err;
+          }
+        }
+        String newurl = url + "/";
+        String fname = entry.getName();
+        newurl += fname;
+        URL aURL = new URL(newurl);
+        String base = aURL.toString();
+        int i = fname.lastIndexOf('.');
+        if (i != -1) {
+          // Trying to resolve the Mime-Type
+          Tika tika = new Tika();
+          String contentType = tika.detect(fname);
+          try {
+            Metadata metadata = new Metadata();
+            metadata.set(Response.CONTENT_LENGTH,
+                Long.toString(entry.getSize()));
+            metadata.set(Response.CONTENT_TYPE, contentType);
+            Content content = new Content(newurl, base, b, contentType,
+                metadata, this.conf);
+            Parse parse = new ParseUtil(this.conf).parse(content).get(
+                content.getUrl());
+            ParseData theParseData = parse.getData();
+            Outlink[] theOutlinks = theParseData.getOutlinks();
+
+            for (int count = 0; count < theOutlinks.length; count++) {
+              outLinksList.add(new Outlink(theOutlinks[count].getToUrl(),
+                  theOutlinks[count].getAnchor()));
+            }
+
+            resultText += entry.getName() + " " + parse.getText() + " ";
+          } catch (ParseException e) {
+            if (LOG.isInfoEnabled()) {
+              LOG.info("fetch okay, but can't parse " + fname + ", reason: "
+                  + e.getMessage());
+            }
+          }
+        }
+      }
+    }
+
+    return resultText;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
new file mode 100644
index 0000000..fc81ee1
--- /dev/null
+++ b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse ZIP files: embedded files are recursively passed to appropriate parsers.
+ */
+package org.apache.nutch.parse.zip;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java b/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java
new file mode 100644
index 0000000..17e386a
--- /dev/null
+++ b/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Based on Unit tests for MSWordParser by John Xing
+ * 
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class TestZipParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data"
+
+  private String[] sampleFiles = { "test.zip" };
+
+  private String expectedText = "textfile.txt This is text file number 1 ";
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    Configuration conf = NutchConfiguration.create();
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(
+          content.getUrl());
+      Assert.assertTrue(parse.getText().equals(expectedText));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/test/resources/test.zip
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/test/resources/test.zip b/nutch-plugins/parse-zip/src/test/resources/test.zip
new file mode 100644
index 0000000..0c649d2
Binary files /dev/null and b/nutch-plugins/parse-zip/src/test/resources/test.zip differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/build-ivy.xml b/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
new file mode 100644
index 0000000..22bee5f
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-naivebayes" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/build.xml b/nutch-plugins/parsefilter-naivebayes/build.xml
new file mode 100644
index 0000000..6fb7a9d
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-naivebayes" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/ivy.xml b/nutch-plugins/parsefilter-naivebayes/ivy.xml
new file mode 100644
index 0000000..08cca2c
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/ivy.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+
+    <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" />
+    <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" >
+      <exclude org="org.apache.mrunit" name="mrunit"/>
+    </dependency>
+    <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" />
+    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" />
+
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/plugin.xml b/nutch-plugins/parsefilter-naivebayes/plugin.xml
new file mode 100644
index 0000000..ac15041
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/plugin.xml
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-naivebayes"
+   name="Naive Bayes Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-naivebayes.jar">
+         <export name="*"/>
+      </library>
+      <library name="commons-cli-2.0-mahout.jar"/>
+      <library name="commons-lang3-3.1.jar"/>
+      <library name="commons-math3-3.2.jar"/>
+      <library name="guava-14.0.1.jar"/>
+      <library name="jackson-core-asl-1.9.12.jar"/>
+      <library name="jackson-mapper-asl-1.9.12.jar"/>
+      <library name="lucene-analyzers-common-5.5.0.jar"/>
+      <library name="lucene-core-5.5.0.jar"/>
+      <library name="mahout-core-0.9.jar"/>
+      <library name="mahout-math-0.10.1.jar"/>
+      <library name="slf4j-api-1.7.12.jar"/>
+      <library name="solr-commons-csv-3.5.0.jar"/>
+      <library name="t-digest-3.1.jar"/>
+      <library name="xmlpull-1.1.3.1.jar"/>
+      <library name="xpp3_min-1.1.4c.jar"/>
+      <library name="xstream-1.4.4.jar"/> 
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.htmlparsefilter.naivebayes"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="NaiveBayesHTMLParseFilter" 
+        class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/pom.xml b/nutch-plugins/parsefilter-naivebayes/pom.xml
new file mode 100644
index 0000000..0a99e47
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parsefilter-naivebayes</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parsefilter-naivebayes</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
new file mode 100644
index 0000000..d755ff6
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.io.InputStreamReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class Classify {
+
+  private static int uniquewords_size = 0;
+
+  private static int numof_ir = 0;
+  private static int numwords_ir = 0;
+  private static HashMap<String, Integer> wordfreq_ir = null;
+
+  private static int numof_r = 0;
+  private static int numwords_r = 0;
+  private static HashMap<String, Integer> wordfreq_r = null;
+  private static boolean ismodel = false;
+
+  public static HashMap<String, Integer> unflattenToHashmap(String line) {
+    HashMap<String, Integer> dict = new HashMap<String, Integer>();
+
+    String dictarray[] = line.split(",");
+
+    for (String field : dictarray) {
+
+      dict.put(field.split(":")[0], Integer.valueOf(field.split(":")[1]));
+    }
+
+    return dict;
+
+  }
+
+  public static String classify(String line) throws IOException {
+
+    double prob_ir = 0;
+    double prob_r = 0;
+
+    String result = "1";
+
+    String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase()
+        .split(" ");
+
+    // read the training file
+    // read the line
+    if (!ismodel) {
+      Configuration configuration = new Configuration();
+      FileSystem fs = FileSystem.get(configuration);
+
+      BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
+          fs.open(new Path("naivebayes-model"))));
+
+      uniquewords_size = Integer.valueOf(bufferedReader.readLine());
+      bufferedReader.readLine();
+
+      numof_ir = Integer.valueOf(bufferedReader.readLine());
+      numwords_ir = Integer.valueOf(bufferedReader.readLine());
+      wordfreq_ir = unflattenToHashmap(bufferedReader.readLine());
+      bufferedReader.readLine();
+      numof_r = Integer.valueOf(bufferedReader.readLine());
+      numwords_r = Integer.valueOf(bufferedReader.readLine());
+      wordfreq_r = unflattenToHashmap(bufferedReader.readLine());
+
+      ismodel = true;
+
+      bufferedReader.close();
+
+    }
+
+    // update probabilities
+
+    for (String word : linearray) {
+      if (wordfreq_ir.containsKey(word))
+        prob_ir += Math.log(wordfreq_ir.get(word)) + 1
+            - Math.log(numwords_ir + uniquewords_size);
+      else
+        prob_ir += 1 - Math.log(numwords_ir + uniquewords_size);
+
+      if (wordfreq_r.containsKey(word))
+        prob_r += Math.log(wordfreq_r.get(word)) + 1
+            - Math.log(numwords_r + uniquewords_size);
+      else
+        prob_r += 1 - Math.log(numwords_r + uniquewords_size);
+
+    }
+
+    prob_ir += Math.log(numof_ir) - Math.log(numof_ir + numof_r);
+    prob_r += Math.log(numof_r) - Math.log(numof_ir + numof_r);
+
+    if (prob_ir > prob_r)
+      result = "0";
+    else
+      result = "1";
+
+    return result;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
new file mode 100644
index 0000000..30810ae
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
@@ -0,0 +1,197 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.naivebayes;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+
+import java.io.Reader;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+
+/**
+ * Html Parse filter that classifies the outlinks from the parseresult as
+ * relevant or irrelevant based on the parseText's relevancy (using a training
+ * file where you can give positive and negative example texts see the
+ * description of parsefilter.naivebayes.trainfile) and if found irrelevant it
+ * gives the link a second chance if it contains any of the words from the list
+ * given in parsefilter.naivebayes.wordlist. CAUTION: Set the parser.timeout to
+ * -1 or a bigger value than 30, when using this classifier.
+ */
+public class NaiveBayesParseFilter implements HtmlParseFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(NaiveBayesParseFilter.class);
+
+  public static final String TRAINFILE_MODELFILTER = "parsefilter.naivebayes.trainfile";
+  public static final String DICTFILE_MODELFILTER = "parsefilter.naivebayes.wordlist";
+
+  private Configuration conf;
+  // Path of the labelled training file, read from TRAINFILE_MODELFILTER.
+  private String inputFilePath;
+  // Path of the word list used by the second-tier outlink check,
+  // read from DICTFILE_MODELFILTER.
+  private String dictionaryFile;
+  // Words loaded from dictionaryFile; an outlink URL containing any of them
+  // is kept even when the parent page is classified irrelevant.
+  private ArrayList<String> wordlist = new ArrayList<String>();
+
+  /**
+   * Classifies the given parse text with the trained model.
+   *
+   * @param text the parse text to classify
+   * @return true if classified relevant; false if irrelevant or if
+   *         classification failed with an IOException (the error is logged
+   *         and the page is treated as irrelevant)
+   */
+  public boolean filterParse(String text) {
+
+    try {
+      return classify(text);
+    } catch (IOException e) {
+      LOG.error("Error occured while classifying:: " + text + " ::"
+          + StringUtils.stringifyException(e));
+    }
+
+    return false;
+  }
+
+  /**
+   * Second-tier check: keeps a URL if it contains any word from the
+   * configured word list.
+   */
+  public boolean filterUrl(String url) {
+
+    return containsWord(url, wordlist);
+
+  }
+
+  /**
+   * @return true when the Naive Bayes model labels the text "1" (relevant)
+   */
+  public boolean classify(String text) throws IOException {
+
+    // if classified as relevant "1" then return true
+    if (Classify.classify(text).equals("1"))
+      return true;
+    return false;
+  }
+
+  /**
+   * Trains the model from {@code inputFilePath} unless a previously built
+   * model file ("naivebayes-model") already exists on the filesystem.
+   */
+  public void train() throws Exception {
+    // check if the model file exists, if it does then don't train
+    if (!FileSystem.get(conf).exists(new Path("naivebayes-model"))) {
+      LOG.info("Training the Naive Bayes Model");
+      Train.start(inputFilePath);
+    } else {
+      LOG.info("Model file already exists. Skipping training.");
+    }
+  }
+
+  /**
+   * @return true if the URL contains any word of the given list
+   *         (plain case-sensitive substring match)
+   */
+  public boolean containsWord(String url, ArrayList<String> wordlist) {
+    for (String word : wordlist) {
+      if (url.contains(word)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Reads the train-file and word-list locations from the configuration,
+   * loads the word list (one word per line) and triggers training.
+   *
+   * @throws IllegalArgumentException when either property is unset/blank
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    inputFilePath = conf.get(TRAINFILE_MODELFILTER);
+    dictionaryFile = conf.get(DICTFILE_MODELFILTER);
+    if (inputFilePath == null || inputFilePath.trim().length() == 0
+        || dictionaryFile == null || dictionaryFile.trim().length() == 0) {
+      // NOTE(review): "parsefilte.naivebayes.*" in the message below is a
+      // typo for "parsefilter.naivebayes.*" (message text left unchanged).
+      String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the parsefilte.naivebayes.trainfile or parsefilte.naivebayes.wordlist";
+      if (LOG.isErrorEnabled()) {
+        LOG.error(message);
+      }
+      throw new IllegalArgumentException(message);
+    }
+    try {
+      // NOTE(review): this condition looks inverted -- it raises
+      // "not found!" when either file DOES exist; presumably the exists()
+      // checks should be negated. Confirm intended behavior.
+      if ((FileSystem.get(conf).exists(new Path(inputFilePath)))
+          || (FileSystem.get(conf).exists(new Path(dictionaryFile)))) {
+        String message = "ParseFilter: NaiveBayes: " + inputFilePath + " or "
+            + dictionaryFile + " not found!";
+        if (LOG.isErrorEnabled()) {
+          LOG.error(message);
+        }
+        throw new IllegalArgumentException(message);
+      }
+
+      BufferedReader br = null;
+
+      String CurrentLine;
+      // The word list is resolved as a configuration resource (classpath),
+      // one word per line.
+      Reader reader = conf.getConfResourceAsReader(dictionaryFile);
+      br = new BufferedReader(reader);
+      while ((CurrentLine = br.readLine()) != null) {
+        wordlist.add(CurrentLine);
+      }
+
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+    }
+    try {
+      train();
+    } catch (Exception e) {
+
+      LOG.error("Error occured while training:: "
+          + StringUtils.stringifyException(e));
+
+    }
+
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Classifies the page's parse text; when it is irrelevant, drops all
+   * outlinks except those whose URL contains a word from the word list.
+   */
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+
+    String url = content.getBaseUrl();
+    ArrayList<Outlink> tempOutlinks = new ArrayList<Outlink>();
+    String text = parse.getText();
+
+    if (!filterParse(text)) { // kick in the second tier
+      // if parent page found
+      // irrelevant
+      LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
+      LOG.info("Checking outlinks");
+
+      Outlink[] out = null;
+      for (int i = 0; i < parse.getData().getOutlinks().length; i++) {
+        LOG.info("ParseFilter: NaiveBayes: Outlink to check:: "
+            + parse.getData().getOutlinks()[i].getToUrl());
+        if (filterUrl(parse.getData().getOutlinks()[i].getToUrl())) {
+          tempOutlinks.add(parse.getData().getOutlinks()[i]);
+          LOG.info("ParseFilter: NaiveBayes: found relevant");
+
+        } else {
+          LOG.info("ParseFilter: NaiveBayes: found irrelevant");
+        }
+      }
+      // replace the outlink array with only the surviving outlinks
+      out = new Outlink[tempOutlinks.size()];
+      for (int i = 0; i < tempOutlinks.size(); i++) {
+        out[i] = tempOutlinks.get(i);
+      }
+      parse.getData().setOutlinks(out);
+
+    } else {
+      LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
+    }
+
+    return parseResult;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
new file mode 100644
index 0000000..19a6911
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class Train {
+
+  /**
+   * Removes the first occurrence of {@code tomatch} from {@code line};
+   * returns the line unchanged when there is no occurrence.
+   */
+  public static String replacefirstoccuranceof(String tomatch, String line) {
+
+    int index = line.indexOf(tomatch);
+    if (index == -1) {
+      return line;
+    } else {
+      return line.substring(0, index)
+          + line.substring(index + tomatch.length());
+    }
+
+  }
+
+  /**
+   * Increments the frequency count of {@code key} in the map; empty keys
+   * are ignored.
+   */
+  public static void updateHashMap(HashMap<String, Integer> dict, String key) {
+    if (!key.equals("")) {
+      if (dict.containsKey(key))
+        dict.put(key, dict.get(key) + 1);
+      else
+        dict.put(key, 1);
+    }
+  }
+
+  /**
+   * Serializes a word:frequency map as "word:count,word:count,...".
+   * Entry order follows HashMap iteration order, so the flattened string is
+   * not deterministic across runs.
+   * NOTE(review): throws StringIndexOutOfBoundsException for an empty map
+   * (substring(0, -1)); confirm callers never pass one.
+   */
+  public static String flattenHashMap(HashMap<String, Integer> dict) {
+    String result = "";
+
+    for (String key : dict.keySet()) {
+
+      result += key + ":" + dict.get(key) + ",";
+    }
+
+    // remove the last comma
+    result = result.substring(0, result.length() - 1);
+
+    return result;
+  }
+
+  /**
+   * Builds the Naive Bayes model from a tab-separated training file
+   * ("label\ttext" per line, label "0" = irrelevant, anything else =
+   * relevant) and writes it to the Hadoop path "naivebayes-model".
+   *
+   * @param filepath training file, resolved as a configuration resource
+   *                 (classpath) rather than through the FileSystem
+   * @throws IOException if reading the input or writing the model fails
+   */
+  public static void start(String filepath) throws IOException {
+
+    // two classes 0/irrelevant and 1/relevant
+
+    // calculate the total number of instances/examples per class, word count in
+    // each class and for each class a word:frequency map
+
+    int numof_ir = 0;
+    int numof_r = 0;
+    int numwords_ir = 0;
+    int numwords_r = 0;
+    HashSet<String> uniquewords = new HashSet<String>();
+    HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
+    HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();
+
+    String line = "";
+    String target = "";
+    String[] linearray = null;
+
+    // read the line
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+
+    BufferedReader bufferedReader = new BufferedReader(
+        configuration.getConfResourceAsReader(filepath));
+
+    while ((line = bufferedReader.readLine()) != null) {
+
+      // the label is everything before the first tab
+      target = line.split("\t")[0];
+
+      line = replacefirstoccuranceof(target + "\t", line);
+
+      // keep only letters and spaces, lowercase, tokenize on single spaces
+      linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" ");
+
+      // update the data structures
+      if (target.equals("0")) {
+
+        numof_ir += 1;
+        numwords_ir += linearray.length;
+        for (int i = 0; i < linearray.length; i++) {
+          uniquewords.add(linearray[i]);
+          updateHashMap(wordfreq_ir, linearray[i]);
+        }
+      } else {
+
+        numof_r += 1;
+        numwords_r += linearray.length;
+        for (int i = 0; i < linearray.length; i++) {
+          uniquewords.add(linearray[i]);
+          updateHashMap(wordfreq_r, linearray[i]);
+        }
+
+      }
+
+    }
+
+    // write the model file
+    // format: vocabulary size, then per class (0 then 1): marker line,
+    // document count, word count, flattened word:frequency map -- the exact
+    // order Classify reads back
+
+    Path path = new Path("naivebayes-model");
+
+    Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path,
+        true)));
+
+    writer.write(String.valueOf(uniquewords.size()) + "\n");
+    writer.write("0\n");
+    writer.write(String.valueOf(numof_ir) + "\n");
+    writer.write(String.valueOf(numwords_ir) + "\n");
+    writer.write(flattenHashMap(wordfreq_ir) + "\n");
+    writer.write("1\n");
+    writer.write(String.valueOf(numof_r) + "\n");
+    writer.write(String.valueOf(numwords_r) + "\n");
+    writer.write(flattenHashMap(wordfreq_r) + "\n");
+
+    writer.close();
+
+    bufferedReader.close();
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
new file mode 100644
index 0000000..6a892be
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Html Parse filter that classifies the outlinks from the parseresult as
+ * relevant or irrelevant based on the parseText's relevancy (using a training
+ * file where you can give positive and negative example texts see the
+ * description of parsefilter.naivebayes.trainfile) and if found irrelevant
+ * it gives the link a second chance if it contains any of the words from the
+ * list given in parsefilter.naivebayes.wordlist. CAUTION: Set the
+ * parser.timeout to -1 or a bigger value than 30, when using this classifier.
+ */
+package org.apache.nutch.parsefilter.naivebayes;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/build.xml b/nutch-plugins/parsefilter-regex/build.xml
new file mode 100644
index 0000000..14d1127
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-regex" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/ivy.xml b/nutch-plugins/parsefilter-regex/ivy.xml
new file mode 100644
index 0000000..ed4cbc3
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/ivy.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>  
+</ivy-module>


[03/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
new file mode 100644
index 0000000..13064eb
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLFilter;
+import org.apache.xerces.util.DOMUtil;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * SubCollection represents a subset of index, you can define url patterns that
+ * will indicate that particular page (url) is part of SubCollection.
+ */
+public class Subcollection extends Configured implements URLFilter {
+
+  public static final String TAG_COLLECTIONS = "subcollections";
+  public static final String TAG_COLLECTION = "subcollection";
+  public static final String TAG_WHITELIST = "whitelist";
+  public static final String TAG_BLACKLIST = "blacklist";
+  public static final String TAG_NAME = "name";
+  public static final String TAG_KEY = "key";
+  public static final String TAG_ID = "id";
+
+  // URL substrings that exclude a page from this subcollection
+  List<String> blackList = new ArrayList<String>();
+  // URL substrings that include a page in this subcollection
+  List<String> whiteList = new ArrayList<String>();
+
+  /**
+   * SubCollection identifier
+   */
+  String id;
+
+  /**
+   * SubCollection key
+   */
+  String key;
+
+  /**
+   * SubCollection name
+   */
+  String name;
+
+  /**
+   * SubCollection whitelist as String
+   */
+  String wlString;
+
+  /**
+   * SubCollection blacklist as String
+   */
+  String blString;
+
+  /**
+   * public Constructor
+   * 
+   * @param id
+   *          id of SubCollection
+   * @param name
+   *          name of SubCollection
+   */
+  public Subcollection(String id, String name, Configuration conf) {
+    this(id, name, null, conf);
+  }
+
+  /**
+   * public Constructor
+   * 
+   * @param id
+   *          id of SubCollection
+   * @param name
+   *          name of SubCollection
+   * @param key
+   *          optional document field name used when indexing this
+   *          SubCollection (may be null)
+   */
+  public Subcollection(String id, String name, String key, Configuration conf) {
+    this(conf);
+    this.id = id;
+    this.key = key;
+    this.name = name;
+  }
+
+  public Subcollection(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * @return Returns the name
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @return Returns the key
+   */
+  public String getKey() {
+    return key;
+  }
+
+  /**
+   * @return Returns the id
+   */
+  public String getId() {
+    return id;
+  }
+
+  /**
+   * Returns whitelist
+   * 
+   * @return Whitelist entries
+   */
+  public List<String> getWhiteList() {
+    return whiteList;
+  }
+
+  /**
+   * Returns whitelist String
+   * 
+   * @return Whitelist String
+   */
+  public String getWhiteListString() {
+    return wlString;
+  }
+
+  /**
+   * Returns blacklist String
+   * 
+   * @return Blacklist String
+   */
+  public String getBlackListString() {
+    return blString;
+  }
+
+  /**
+   * @param whiteList
+   *          The whiteList to set.
+   */
+  // NOTE(review): unlike setWhiteList(String), this does not refresh
+  // wlString, so getWhiteListString() can go stale after this call.
+  public void setWhiteList(ArrayList<String> whiteList) {
+    this.whiteList = whiteList;
+  }
+
+  /**
+   * Simple "indexOf" currentFilter for matching patterns.
+   * 
+   * <pre>
+   *  rules for evaluation are as follows:
+   *  1. if pattern matches in blacklist then url is rejected
+   *  2. if pattern matches in whitelist then url is allowed
+   *  3. url is rejected
+   * </pre>
+   * 
+   * @see org.apache.nutch.net.URLFilter#filter(java.lang.String)
+   */
+  public String filter(String urlString) {
+    // first the blacklist -- a blacklist hit always wins
+    Iterator<String> i = blackList.iterator();
+    while (i.hasNext()) {
+      String row = (String) i.next();
+      if (urlString.contains(row))
+        return null;
+    }
+
+    // then whitelist
+    i = whiteList.iterator();
+    while (i.hasNext()) {
+      String row = (String) i.next();
+      if (urlString.contains(row))
+        return urlString;
+    }
+    // no match in either list: reject
+    return null;
+  }
+
+  /**
+   * Initialize Subcollection from dom element
+   * 
+   * @param collection
+   *          element whose id, name and whitelist children are required;
+   *          blacklist and key are optional
+   */
+  public void initialize(Element collection) {
+    this.id = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_ID).item(0)).trim();
+    this.name = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_NAME).item(0)).trim();
+    this.wlString = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim();
+
+    parseList(this.whiteList, wlString);
+
+    // Check if there's a blacklist we need to parse
+    NodeList nodeList = collection.getElementsByTagName(TAG_BLACKLIST);
+    if (nodeList.getLength() > 0) {
+      this.blString = DOMUtil.getChildText(nodeList.item(0)).trim();
+      parseList(this.blackList, blString);
+    }
+
+    // Check if there's a key element or set default name
+    nodeList = collection.getElementsByTagName(TAG_KEY);
+    if (nodeList.getLength() == 1) {
+      this.key = DOMUtil.getChildText(nodeList.item(0)).trim();
+    }
+  }
+
+  /**
+   * Create a list of patterns from chunk of text, patterns are separated with
+   * newline
+   * 
+   * @param list
+   *          destination list; cleared before parsing
+   * @param text
+   *          newline-separated patterns; each entry is trimmed
+   */
+  protected void parseList(List<String> list, String text) {
+    list.clear();
+
+    StringTokenizer st = new StringTokenizer(text, "\n\r");
+
+    while (st.hasMoreElements()) {
+      String line = (String) st.nextElement();
+      list.add(line.trim());
+    }
+  }
+
+  /**
+   * Set contents of blacklist from String
+   * 
+   * @param list
+   *          the blacklist contents
+   */
+  public void setBlackList(String list) {
+    this.blString = list;
+    parseList(blackList, list);
+  }
+
+  /**
+   * Set contents of whitelist from String
+   * 
+   * @param list
+   *          the whitelist contents
+   */
+  public void setWhiteList(String list) {
+    this.wlString = list;
+    parseList(whiteList, list);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
new file mode 100644
index 0000000..be08d1c
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
@@ -0,0 +1,36 @@
+<html>
+<body>
+<p>
+Subcollection is a subset of an index. Subcollections are defined
+by urlpatterns in form of white/blacklist. So to get the page into
+subcollection it must match the whitelist and not the blacklist.
+</p>
+<p>
+Subcollection definitions are read from a file subcollections.xml
+and the format is as follows (imagine here that you are crawling all
+the virtualhosts from apache.org and you want to tag pages with
+url pattern "http://lucene.apache.org/nutch" and http://wiki.apache.org/nutch/
+to be part of subcollection "nutch", this allows you to later search
+specifically from this subcollection)
+</p>
+<p/>
+<p/>
+<pre>
+&lt;?xml version="1.0" encoding="UTF-8"?>
+&lt;subcollections>
+	&lt;subcollection>
+		&lt;name>nutch&lt;/name>
+		&lt;id>lucene&lt;/id>
+		&lt;whitelist>http://lucene.apache.org/nutch&lt;/whitelist>
+		&lt;whitelist>http://wiki.apache.org/nutch/&lt;/whitelist>
+		&lt;blacklist />
+	&lt;/subcollection>
+&lt;/subcollections>
+</pre>
+</p>
+<p>Despite of this configuration you still can crawl any urls
+as long as they pass through your global url filters. (note that
+you must also seed your urls in normal nutch way)
+</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
new file mode 100644
index 0000000..2946d9e
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.subcollection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.collection.CollectionManager;
+import org.apache.nutch.collection.Subcollection;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+public class SubcollectionIndexingFilter extends Configured implements
+    IndexingFilter {
+
+  private Configuration conf;
+
+  public SubcollectionIndexingFilter() {
+    super(NutchConfiguration.create());
+  }
+
+  public SubcollectionIndexingFilter(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * @param conf
+   *          Configuration to use; also resolves the default document field
+   *          name from subcollection.default.fieldname
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    fieldName = conf.get("subcollection.default.fieldname", "subcollection");
+  }
+
+  /**
+   * @return Configuration
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Doc field name
+   */
+  // NOTE(review): static and mutable, so the last setConf() call wins
+  // across all instances in the JVM; confirm this is intended.
+  public static String fieldName = "subcollection";
+
+  /**
+   * Logger
+   */
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SubcollectionIndexingFilter.class);
+
+  /**
+   * "Mark" document to be a part of subcollection
+   * 
+   * @param doc
+   *          document to tag; gets one field per matching subcollection,
+   *          named by the subcollection's key when present, otherwise by
+   *          the default fieldName
+   * @param url
+   *          url matched against each subcollection's white/blacklists
+   */
+  private void addSubCollectionField(NutchDocument doc, String url) {
+    for (Subcollection coll : CollectionManager.getCollectionManager(getConf())
+        .getSubCollections(url)) {
+      if (coll.getKey() == null) {
+        doc.add(fieldName, coll.getName());
+      } else {
+        doc.add(coll.getKey(), coll.getName());
+      }
+    }
+  }
+
+  /** Adds a subcollection field for every subcollection matching the URL. */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    String sUrl = url.toString();
+    addSubCollectionField(doc, sUrl);
+    return doc;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
new file mode 100644
index 0000000..1c6ba72
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to assign documents to subcollections.
+ * The field "subcollection" is added and filled with a collection name
+ * defined in a configuration file and selected by pattern, see
+ * {@link org.apache.nutch.collection}.
+ */
+package org.apache.nutch.indexer.subcollection;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java b/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java
new file mode 100644
index 0000000..a2d2772
--- /dev/null
+++ b/nutch-plugins/subcollection/src/test/java/org/apache/nutch/collection/TestSubcollection.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Collection;
+
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestSubcollection {
+
+  /**
+   * Test white- and blacklist filtering logic.
+   * 
+   * @throws Exception if the subcollection cannot be configured
+   */
+  @Test
+  public void testFilter() throws Exception {
+    Subcollection sc = new Subcollection(NutchConfiguration.create());
+    sc.setWhiteList("www.nutch.org\nwww.apache.org");
+    sc.setBlackList("jpg\nwww.apache.org/zecret/");
+
+    // matches whitelist
+    Assert.assertEquals("http://www.apache.org/index.html",
+        sc.filter("http://www.apache.org/index.html"));
+
+    // matches blacklist
+    Assert.assertNull(
+        sc.filter("http://www.apache.org/zecret/index.html"));
+    Assert.assertNull(sc.filter("http://www.apache.org/img/image.jpg"));
+
+    // no match
+    Assert.assertNull(sc.filter("http://www.google.com/"));
+  }
+
+  @Test
+  public void testInput() {
+    StringBuilder xml = new StringBuilder();
+    xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
+    xml.append("<!-- just a comment -->");
+    xml.append("<subcollections>");
+    xml.append("<subcollection>");
+    xml.append("<name>nutch collection</name>");
+    xml.append("<id>nutch</id>");
+    xml.append("<whitelist>");
+    xml.append("http://lucene.apache.org/nutch/\n");
+    xml.append("http://wiki.apache.org/nutch/\n");
+    xml.append("</whitelist>");
+    xml.append("<blacklist>");
+    xml.append("http://www.xxx.yyy\n");
+    xml.append("</blacklist>");
+    xml.append("</subcollection>");
+    xml.append("</subcollections>");
+
+    InputStream is = new ByteArrayInputStream(xml.toString().getBytes());
+
+    CollectionManager cm = new CollectionManager();
+    cm.parse(is);
+
+    Collection<?> c = cm.getAll();
+
+    // test that size matches
+    Assert.assertEquals(1, c.size());
+
+    Subcollection collection = (Subcollection) c.toArray()[0];
+
+    // test collection id
+    Assert.assertEquals("nutch", collection.getId());
+
+    // test collection name
+    Assert.assertEquals("nutch collection", collection.getName());
+
+    // test whitelist
+    Assert.assertEquals(2, collection.whiteList.size());
+
+    String wlUrl = (String) collection.whiteList.get(0);
+    Assert.assertEquals("http://lucene.apache.org/nutch/", wlUrl);
+
+    wlUrl = (String) collection.whiteList.get(1);
+    Assert.assertEquals("http://wiki.apache.org/nutch/", wlUrl);
+
+    // matches whitelist
+    Assert.assertEquals("http://lucene.apache.org/nutch/",
+        collection.filter("http://lucene.apache.org/nutch/"));
+
+    // test blacklist
+    Assert.assertEquals(1, collection.blackList.size());
+
+    String blUrl = (String) collection.blackList.get(0);
+    Assert.assertEquals("http://www.xxx.yyy", blUrl);
+
+    // no match
+    Assert.assertNull(collection.filter("http://www.google.com/"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/build.xml b/nutch-plugins/tld/build.xml
new file mode 100644
index 0000000..f46c8e6
--- /dev/null
+++ b/nutch-plugins/tld/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="tld" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/ivy.xml b/nutch-plugins/tld/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/tld/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/plugin.xml b/nutch-plugins/tld/plugin.xml
new file mode 100644
index 0000000..712a34a
--- /dev/null
+++ b/nutch-plugins/tld/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="tld"
+   name="Top Level Domain Plugin"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="tld.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.tld"
+              name="Top Level Domain Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="TLDIndexingFilter"
+                      class="org.apache.nutch.indexer.tld.TLDIndexingFilter"/>
+   </extension>
+
+   <extension id="org.apache.nutch.scoring.tld"
+              name="Top Level Domain Scoring Filter"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.tld.TLDScoringFilter"
+                      class="org.apache.nutch.scoring.tld.TLDScoringFilter" />
+   </extension>
+
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/pom.xml b/nutch-plugins/tld/pom.xml
new file mode 100644
index 0000000..95039bd
--- /dev/null
+++ b/nutch-plugins/tld/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>tld</artifactId>
+    <packaging>jar</packaging>
+
+    <name>tld</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
new file mode 100644
index 0000000..cd7e194
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.tld;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * Adds the Top level domain extensions to the index
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDIndexingFilter implements IndexingFilter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TLDIndexingFilter.class);
+
+  private Configuration conf;
+  /** Adds the domain suffix of the URL's host as indexed field "tld". */
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    try {
+      URL url = new URL(urlText.toString());
+      DomainSuffix d = URLUtil.getDomainSuffix(url);
+      doc.add("tld", d.getDomain());
+    } catch (Exception ex) {
+      // never fail the whole indexing job over one malformed URL
+      LOG.warn("Cannot add tld for " + urlText + ": " + ex);
+    }
+    return doc;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
new file mode 100644
index 0000000..75841d9
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Indexing plugin.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
new file mode 100644
index 0000000..b7f4963
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.tld;
+
+import java.util.List;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+/**
+ * Scoring filter to boost tlds.
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDScoringFilter implements ScoringFilter {
+
+  private Configuration conf;
+  private DomainSuffixes tldEntries;
+
+  public TLDScoringFilter() {
+    tldEntries = DomainSuffixes.getInstance();
+  }
+
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+
+    NutchField tlds = doc.getField("tld");
+    float boost = 1.0f;
+
+    if (tlds != null) {
+      for (Object tld : tlds.getValues()) {
+        DomainSuffix entry = tldEntries.get(tld.toString());
+        if (entry != null)
+          boost *= entry.getBoost();
+      }
+    }
+    return initScore * boost;
+  }
+
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
+      ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount,
+      int validCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return initSort;
+  }
+
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+  }
+
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+  }
+
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
new file mode 100644
index 0000000..d05e4b8
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Scoring plugin.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/build.xml b/nutch-plugins/urlfilter-automaton/build.xml
new file mode 100644
index 0000000..78557fc
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-automaton" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/ivy.xml b/nutch-plugins/urlfilter-automaton/ivy.xml
new file mode 100644
index 0000000..7c1968f
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="dk.brics.automaton" name="automaton" rev="1.11-8" conf="*->default" />
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/plugin.xml b/nutch-plugins/urlfilter-automaton/plugin.xml
new file mode 100644
index 0000000..d0cc1ef
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-automaton"
+   name="Automaton URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-automaton.jar">
+         <export name="*"/>
+      </library>
+      <library name="automaton-1.11-8.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.automaton"
+              name="Nutch Automaton URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="AutomatonURLFilter"
+                      class="org.apache.nutch.urlfilter.automaton.AutomatonURLFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/pom.xml b/nutch-plugins/urlfilter-automaton/pom.xml
new file mode 100644
index 0000000..898944e
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/pom.xml
@@ -0,0 +1,58 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-automaton</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-automaton</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>dk.brics.automaton</groupId>
+            <artifactId>automaton</artifactId>
+            <version>1.11-8</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-regex-filter</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-regex-filter</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>test</scope>
+            <type>test-jar</type>
+        </dependency>
+
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
new file mode 100644
index 0000000..ae4896d
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Automaton imports
+import dk.brics.automaton.RegExp;
+import dk.brics.automaton.RunAutomaton;
+import org.apache.nutch.net.*;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+
+/**
+ * RegexURLFilterBase implementation based on the <a
+ * href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+ * Automata for Java<sup>TM</sup>.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ */
+public class AutomatonURLFilter extends RegexURLFilterBase {
+  public static final String URLFILTER_AUTOMATON_FILE = "urlfilter.automaton.file";
+  public static final String URLFILTER_AUTOMATON_RULES = "urlfilter.automaton.rules";
+
+  public AutomatonURLFilter() {
+    super();
+  }
+
+  public AutomatonURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
+    super(filename);
+  }
+
+  AutomatonURLFilter(Reader reader) throws IOException,
+      IllegalArgumentException {
+    super(reader);
+  }
+
+  /*
+   * Implementation of the RegexURLFilterBase extension points:
+   * rule reading and rule creation.
+   */
+
+  /**
+   * Rules specified as a config property will override rules specified as a
+   * config file.
+   */
+  @Override
+  protected Reader getRulesReader(Configuration conf) throws IOException {
+    String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    String fileRules = conf.get(URLFILTER_AUTOMATON_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  @Override
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+  @Override
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+
+  /**
+   * Command-line entry point, delegates to the shared
+   * {@link RegexURLFilterBase} main helper.
+   */
+  public static void main(String[] args) throws IOException {
+    main(new AutomatonURLFilter(), args);
+  }
+
+  private static class Rule extends RegexRule {
+
+    private final RunAutomaton automaton;
+
+    Rule(boolean sign, String regex) {
+      super(sign, regex);
+      automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+    }
+
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+    }
+    @Override
+    protected boolean match(String url) {
+      return automaton.run(url);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
new file mode 100644
index 0000000..42533f7
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>
+URL filter plugin based on
+<a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+Automata for Java<sup>TM</sup>.
+</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
new file mode 100644
index 0000000..a70a6b6
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/java/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.*;
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based test of class <code>AutomatonURLFilter</code>.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {
+
+  protected URLFilter getURLFilter(Reader rules) {
+    try {
+      return new AutomatonURLFilter(rules);
+    } catch (IOException e) {
+      Assert.fail(e.toString());
+      return null;
+    }
+  }
+
+  @Test
+  public void test() {
+    test("WholeWebCrawling");
+    test("IntranetCrawling");
+    bench(50, "Benchmarks");
+    bench(100, "Benchmarks");
+    bench(200, "Benchmarks");
+    bench(400, "Benchmarks");
+    bench(800, "Benchmarks");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules
new file mode 100644
index 0000000..a2f6da0
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.rules
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# skip .fr .org and .net domains
+-.*//.*\.fr/.*
+-.*//.*\.org/.*
+-.*//.*\.net/.*
+
+# skip everything else
++.*

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls
new file mode 100644
index 0000000..40bf4ee
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/resources/Benchmarks.urls
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules
new file mode 100644
index 0000000..8966183
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.rules
@@ -0,0 +1,24 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# accept hosts in MY.DOMAIN.NAME
++http://([a-z0-9]*\.)*MY.DOMAIN.NAME/.*
+
+# skip everything else
+-.*

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls
new file mode 100644
index 0000000..b1ad9b7
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/resources/IntranetCrawling.urls
@@ -0,0 +1,8 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
+-news://any.news.server/comp.lang.java
+-whois:/nutch.org
++http://MY.DOMAIN.NAME/
++http://MY.DOMAIN.NAME/nutch
++http://www.MY.DOMAIN.NAME/

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules
new file mode 100644
index 0000000..dfae8b0
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.rules
@@ -0,0 +1,19 @@
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file: ftp: and mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# accept anything else
++.*

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls
new file mode 100644
index 0000000..d3b1bf3
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/resources/WholeWebCrawling.urls
@@ -0,0 +1,11 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
++news://any.news.server/comp.lang.java
++whois:/nutch.org
+-http://www.nutch.org/nutch.gif
+-http://www.nutch.org/nutch.eps
+-http://www.nutch.org/nutch?q=nutch
++http://www.nutch.org/
++http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
++http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/build.xml b/nutch-plugins/urlfilter-domain/build.xml
new file mode 100644
index 0000000..4af55ac
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domain" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/ivy.xml b/nutch-plugins/urlfilter-domain/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/plugin.xml b/nutch-plugins/urlfilter-domain/plugin.xml
new file mode 100644
index 0000000..1452d58
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-domain"
+   name="Domain URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-domain.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.domain"
+              name="Nutch Domain URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="DomainURLFilter"
+        class="org.apache.nutch.urlfilter.domain.DomainURLFilter">
+        <parameter name="file" value="domain-urlfilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/pom.xml b/nutch-plugins/urlfilter-domain/pom.xml
new file mode 100644
index 0000000..0c9dddd
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-domain</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-domain</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
new file mode 100644
index 0000000..821d944
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domain;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. Only a url that matches one of the suffixes, domains, or hosts
+ * present in the file is allowed.
+ * </p>
+ * 
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * 
+ * <pre>
+ * com apache.org www.apache.org
+ * </pre>
+ * 
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overridding the more specific.
+ * </p>
+ * 
+ * The domain file defaults to domain-urlfilter.txt in the classpath but can be
+ * overridden using the:
+ * 
+ * <ul>
+ * <ol>
+ * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
+ * </ol>
+ * <ol>
+ * attribute "file" in plugin.xml of this plugin
+ * </ol>
+ * </ul>
+ * 
+ * the attribute "file" has higher precedence if defined.
+ */
+public class DomainURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainURLFilter.class);
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+  private Configuration conf;
+  private String domainFile = null;
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  private void readConfiguration(Reader configReader) throws IOException {
+
+    // read the configuration file, line by line
+    BufferedReader reader = new BufferedReader(configReader);
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        // add non-blank lines and non-commented lines
+        domainSet.add(StringUtils.lowerCase(line.trim()));
+      }
+    }
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   * 
+   * @param domainFile
+   *          The domain file, overrides domain-urlfilter.text default.
+   * 
+   * @throws IOException
+   */
+  public DomainURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "urlfilter-domain";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    } else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+            + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domain.file");
+    String stringRules = conf.get("urlfilter.domain.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+    try {
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public String filter(String url) {
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    if (domainSet.size() == 0) return url;
+    
+    try {
+      // match for suffix, domain, and host in that order. more general will
+      // override more specific
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+          || domainSet.contains(host)) {
+        return url;
+      }
+
+      // doesn't match, don't allow
+      return null;
+    } catch (Exception e) {
+
+      // if an error happens, allow the url to pass
+      LOG.error("Could not apply filter on url: " + url + "\n"
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
new file mode 100644
index 0000000..d2eba1f
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to include only URLs which match an element in a given list of
+ * domain suffixes, domain names, and/or host names.
+ * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart
+ * (exclude URLs by host or domain).
+ */
+package org.apache.nutch.urlfilter.domain;
+


[06/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
new file mode 100644
index 0000000..afcf24a
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Based on EasySSLProtocolSocketFactory from commons-httpclient:
+ * 
+ * $Header:
+ * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v
+ * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52
+ * -0800 (Sat, 26 Feb 2005) $
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.Socket;
+import java.net.UnknownHostException;
+
+import org.apache.commons.httpclient.ConnectTimeoutException;
+import org.apache.commons.httpclient.HttpClientError;
+import org.apache.commons.httpclient.params.HttpConnectionParams;
+import org.apache.commons.httpclient.protocol.ControllerThreadSocketFactory;
+import org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.net.ssl.SSLContext;
+import javax.net.ssl.TrustManager;
+
+public class DummySSLProtocolSocketFactory implements
+    SecureProtocolSocketFactory {
+
+  /** Logger object for this class. */
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DummySSLProtocolSocketFactory.class);
+
+  private SSLContext sslcontext = null;
+
+  /**
+   * Constructor for DummySSLProtocolSocketFactory.
+   */
+  public DummySSLProtocolSocketFactory() {
+    super();
+  }
+
+  private static SSLContext createEasySSLContext() {
+    try {
+      SSLContext context = SSLContext.getInstance("SSL");
+      context.init(null,
+          new TrustManager[] { new DummyX509TrustManager(null) }, null);
+      return context;
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage(), e);
+      }
+      throw new HttpClientError(e.toString());
+    }
+  }
+
+  private SSLContext getSSLContext() {
+    if (this.sslcontext == null) {
+      this.sslcontext = createEasySSLContext();
+    }
+    return this.sslcontext;
+  }
+
+  /**
+   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int)
+   */
+  public Socket createSocket(String host, int port, InetAddress clientHost,
+      int clientPort) throws IOException, UnknownHostException {
+
+    return getSSLContext().getSocketFactory().createSocket(host, port,
+        clientHost, clientPort);
+  }
+
+  /**
+   * Attempts to get a new socket connection to the given host within the given
+   * time limit.
+   * <p>
+   * To circumvent the limitations of older JREs that do not support connect
+   * timeout a controller thread is executed. The controller thread attempts to
+   * create a new socket within the given limit of time. If socket constructor
+   * does not return until the timeout expires, the controller terminates and
+   * throws an {@link ConnectTimeoutException}
+   * </p>
+   * 
+   * @param host
+   *          the host name/IP
+   * @param port
+   *          the port on the host
+   * @param localAddress
+   *          the local host name/IP to bind the socket to
+   * @param localPort
+   *          the port on the local machine
+   * @param params
+   *          {@link HttpConnectionParams Http connection parameters}
+   * 
+   * @return Socket a new socket
+   * 
+   * @throws IOException
+   *           if an I/O error occurs while creating the socket
+   * @throws UnknownHostException
+   *           if the IP address of the host cannot be determined
+   */
+  public Socket createSocket(final String host, final int port,
+      final InetAddress localAddress, final int localPort,
+      final HttpConnectionParams params) throws IOException,
+      UnknownHostException, ConnectTimeoutException {
+    if (params == null) {
+      throw new IllegalArgumentException("Parameters may not be null");
+    }
+    int timeout = params.getConnectionTimeout();
+    if (timeout == 0) {
+      return createSocket(host, port, localAddress, localPort);
+    } else {
+      // To be eventually deprecated when migrated to Java 1.4 or above
+      return ControllerThreadSocketFactory.createSocket(this, host, port,
+          localAddress, localPort, timeout);
+    }
+  }
+
+  /**
+   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int)
+   */
+  public Socket createSocket(String host, int port) throws IOException,
+      UnknownHostException {
+    return getSSLContext().getSocketFactory().createSocket(host, port);
+  }
+
+  /**
+   * @see org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean)
+   */
+  public Socket createSocket(Socket socket, String host, int port,
+      boolean autoClose) throws IOException, UnknownHostException {
+    return getSSLContext().getSocketFactory().createSocket(socket, host, port,
+        autoClose);
+  }
+
+  public boolean equals(Object obj) {
+    return ((obj != null) && obj.getClass().equals(
+        DummySSLProtocolSocketFactory.class));
+  }
+
+  public int hashCode() {
+    return DummySSLProtocolSocketFactory.class.hashCode();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
new file mode 100644
index 0000000..b5509cc
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Based on EasyX509TrustManager from commons-httpclient.
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.security.KeyStore;
+import java.security.KeyStoreException;
+import java.security.NoSuchAlgorithmException;
+import java.security.cert.CertificateException;
+import java.security.cert.X509Certificate;
+
+import javax.net.ssl.TrustManagerFactory;
+import javax.net.ssl.TrustManager;
+import javax.net.ssl.X509TrustManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class DummyX509TrustManager implements X509TrustManager {
+  private X509TrustManager standardTrustManager = null;
+
+  /** Logger object for this class. */
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DummyX509TrustManager.class);
+
+  /**
+   * Constructor for DummyX509TrustManager.
+   */
+  public DummyX509TrustManager(KeyStore keystore)
+      throws NoSuchAlgorithmException, KeyStoreException {
+    super();
+    String algo = TrustManagerFactory.getDefaultAlgorithm();
+    TrustManagerFactory factory = TrustManagerFactory.getInstance(algo);
+    factory.init(keystore);
+    TrustManager[] trustmanagers = factory.getTrustManagers();
+    if (trustmanagers.length == 0) {
+      throw new NoSuchAlgorithmException(algo + " trust manager not supported");
+    }
+    this.standardTrustManager = (X509TrustManager) trustmanagers[0];
+  }
+
+  /**
+   * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
+   *      String)
+   */
+  public boolean isClientTrusted(X509Certificate[] certificates) {
+    return true;
+  }
+
+  /**
+   * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
+   *      String)
+   */
+  public boolean isServerTrusted(X509Certificate[] certificates) {
+    return true;
+  }
+
+  /**
+   * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
+   */
+  public X509Certificate[] getAcceptedIssuers() {
+    return this.standardTrustManager.getAcceptedIssuers();
+  }
+
+  public void checkClientTrusted(X509Certificate[] arg0, String arg1)
+      throws CertificateException {
+    // do nothing
+
+  }
+
+  public void checkServerTrusted(X509Certificate[] arg0, String arg1)
+      throws CertificateException {
+    // do nothing
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/Http.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/Http.java
new file mode 100644
index 0000000..75506ce
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -0,0 +1,572 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+// JDK imports
+import java.io.InputStream;
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Node;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// HTTP Client imports
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HostConfiguration;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
+import org.apache.commons.httpclient.NTCredentials;
+import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
+import org.apache.commons.httpclient.protocol.Protocol;
+import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
+// NUTCH-1929 Consider implementing dependency injection for crawl HTTPS sites that use self signed certificates
+//import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;
+
+import org.apache.commons.lang.StringUtils;
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * <p>
+ * This class is a protocol plugin that configures an HTTP client for Basic,
+ * Digest and NTLM authentication schemes for web server as well as proxy
+ * server. It takes care of HTTPS protocol as well as cookies in a single fetch
+ * session.
+ * </p>
+ * <p>
+ * Documentation can be found on the Nutch <a
+ * href="https://wiki.apache.org/nutch/HttpAuthenticationSchemes"
+ * >HttpAuthenticationSchemes</a> wiki page.
+ * </p>
+ * <p>
+ * The original description of the motivation to support <a
+ * href="https://wiki.apache.org/nutch/HttpPostAuthentication"
+ * >HttpPostAuthentication</a> is also included on the Nutch wiki. Additionally
+ * HttpPostAuthentication development is documented at the <a
+ * href="https://issues.apache.org/jira/browse/NUTCH-827">NUTCH-827</a> Jira
+ * issue.
+ * 
+ * @author Susam Pal
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
+
+  // Since the Configuration has not yet been set,
+  // then an unconfigured client is returned.
+  private static HttpClient client = new HttpClient(connectionManager);
+  private static String defaultUsername;
+  private static String defaultPassword;
+  private static String defaultRealm;
+  private static String defaultScheme;
+  private static String authFile;
+  private static String agentHost;
+  private static boolean authRulesRead = false;
+  private static Configuration conf;
+
+  private int maxThreadsTotal = 10;
+
+  private String proxyUsername;
+  private String proxyPassword;
+  private String proxyRealm;
+
+  private static HttpFormAuthConfigurer formConfigurer;
+
+  /**
+   * Returns the configured HTTP client.
+   * 
+   * @return HTTP client
+   */
+  static synchronized HttpClient getClient() {
+    return client;
+  }
+
+  /**
+   * Constructs this plugin.
+   */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Reads the configuration from the Nutch configuration files and sets the
+   * configuration.
+   * 
+   * @param conf
+   *          Configuration
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    this.conf = conf;
+    this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
+    this.proxyUsername = conf.get("http.proxy.username", "");
+    this.proxyPassword = conf.get("http.proxy.password", "");
+    this.proxyRealm = conf.get("http.proxy.realm", "");
+    agentHost = conf.get("http.agent.host", "");
+    authFile = conf.get("http.auth.file", "");
+    configureClient();
+    try {
+      setCredentials();
+    } catch (Exception ex) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("Could not read " + authFile + " : " + ex.getMessage());
+      }
+    }
+  }
+
+  /**
+   * Main method.
+   * 
+   * @param args
+   *          Command line arguments
+   */
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  /**
+   * Fetches the <code>url</code> with a configured HTTP client and gets the
+   * response.
+   * 
+   * @param url
+   *          URL to be fetched
+   * @param datum
+   *          Crawl data
+   * @param redirect
+   *          Follow redirects if and only if true
+   * @return HTTP response
+   */
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    resolveCredentials(url);
+    return new HttpResponse(this, url, datum, redirect);
+  }
+
+  /**
+   * Configures the HTTP client
+   */
+  private void configureClient() {
+
+    // Set up an HTTPS socket factory that accepts self-signed certs.
+    // ProtocolSocketFactory factory = new SSLProtocolSocketFactory();
+    ProtocolSocketFactory factory = new DummySSLProtocolSocketFactory();
+    Protocol https = new Protocol("https", factory, 443);
+    Protocol.registerProtocol("https", https);
+
+    HttpConnectionManagerParams params = connectionManager.getParams();
+    params.setConnectionTimeout(timeout);
+    params.setSoTimeout(timeout);
+    params.setSendBufferSize(BUFFER_SIZE);
+    params.setReceiveBufferSize(BUFFER_SIZE);
+
+    // --------------------------------------------------------------------------------
+    // NUTCH-1836: Modification to increase the number of available connections
+    // for multi-threaded crawls.
+    // --------------------------------------------------------------------------------
+    params.setMaxTotalConnections(conf.getInt(
+        "mapred.tasktracker.map.tasks.maximum", 5)
+        * conf.getInt("fetcher.threads.fetch", maxThreadsTotal));
+
+    // Also set max connections per host to maxThreadsTotal since all threads
+    // might be used to fetch from the same host - otherwise timeout errors can
+    // occur
+    params.setDefaultMaxConnectionsPerHost(conf.getInt(
+        "fetcher.threads.fetch", maxThreadsTotal));
+
+    // executeMethod(HttpMethod) seems to ignore the connection timeout on the
+    // connection manager.
+    // set it explicitly on the HttpClient.
+    client.getParams().setConnectionManagerTimeout(timeout);
+
+    HostConfiguration hostConf = client.getHostConfiguration();
+    ArrayList<Header> headers = new ArrayList<Header>();
+    // Set the User Agent in the header
+    // headers.add(new Header("User-Agent", userAgent)); //NUTCH-1941
+    // prefer English
+    headers.add(new Header("Accept-Language", acceptLanguage));
+    // prefer UTF-8
+    headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
+    // prefer understandable formats
+    headers
+        .add(new Header(
+            "Accept",
+            "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
+    // accept gzipped content
+    headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
+    hostConf.getParams().setParameter("http.default-headers", headers);
+
+    // HTTP proxy server details
+    if (useProxy) {
+      hostConf.setProxy(proxyHost, proxyPort);
+
+      if (proxyUsername.length() > 0) {
+
+        AuthScope proxyAuthScope = getAuthScope(this.proxyHost, this.proxyPort,
+            this.proxyRealm);
+
+        NTCredentials proxyCredentials = new NTCredentials(this.proxyUsername,
+            this.proxyPassword, Http.agentHost, this.proxyRealm);
+
+        client.getState().setProxyCredentials(proxyAuthScope, proxyCredentials);
+      }
+    }
+
+  }
+
+  /**
+   * Reads authentication configuration file (defined as 'http.auth.file' in
+   * Nutch configuration file) and sets the credentials for the configured
+   * authentication scopes in the HTTP client object.
+   * 
+   * @throws ParserConfigurationException
+   *           If a document builder can not be created.
+   * @throws SAXException
+   *           If any parsing error occurs.
+   * @throws IOException
+   *           If any I/O error occurs.
+   */
+  private static synchronized void setCredentials()
+      throws ParserConfigurationException, SAXException, IOException {
+
+    if (authRulesRead)
+      return;
+
+    authRulesRead = true; // Avoid re-attempting to read
+
+    InputStream is = conf.getConfResourceAsInputStream(authFile);
+    if (is != null) {
+      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
+          .parse(is);
+
+      Element rootElement = doc.getDocumentElement();
+      if (!"auth-configuration".equals(rootElement.getTagName())) {
+        if (LOG.isWarnEnabled())
+          LOG.warn("Bad auth conf file: root element <"
+              + rootElement.getTagName() + "> found in " + authFile
+              + " - must be <auth-configuration>");
+      }
+
+      // For each set of credentials
+      NodeList credList = rootElement.getChildNodes();
+      for (int i = 0; i < credList.getLength(); i++) {
+        Node credNode = credList.item(i);
+        if (!(credNode instanceof Element))
+          continue;
+
+        Element credElement = (Element) credNode;
+        if (!"credentials".equals(credElement.getTagName())) {
+          if (LOG.isWarnEnabled())
+            LOG.warn("Bad auth conf file: Element <" + credElement.getTagName()
+                + "> not recognized in " + authFile
+                + " - expected <credentials>");
+          continue;
+        }
+
+        String authMethod = credElement.getAttribute("authMethod");
+        // read http form post auth info
+        if (StringUtils.isNotBlank(authMethod)) {
+          formConfigurer = readFormAuthConfigurer(credElement, authMethod);
+          continue;
+        }
+
+        String username = credElement.getAttribute("username");
+        String password = credElement.getAttribute("password");
+
+        // For each authentication scope
+        NodeList scopeList = credElement.getChildNodes();
+        for (int j = 0; j < scopeList.getLength(); j++) {
+          Node scopeNode = scopeList.item(j);
+          if (!(scopeNode instanceof Element))
+            continue;
+
+          Element scopeElement = (Element) scopeNode;
+
+          if ("default".equals(scopeElement.getTagName())) {
+
+            // Determine realm and scheme, if any
+            String realm = scopeElement.getAttribute("realm");
+            String scheme = scopeElement.getAttribute("scheme");
+
+            // Set default credentials
+            defaultUsername = username;
+            defaultPassword = password;
+            defaultRealm = realm;
+            defaultScheme = scheme;
+
+            if (LOG.isTraceEnabled()) {
+              LOG.trace("Credentials - username: " + username
+                  + "; set as default" + " for realm: " + realm + "; scheme: "
+                  + scheme);
+            }
+
+          } else if ("authscope".equals(scopeElement.getTagName())) {
+
+            // Determine authentication scope details
+            String host = scopeElement.getAttribute("host");
+            int port = -1; // For setting port to AuthScope.ANY_PORT
+            try {
+              port = Integer.parseInt(scopeElement.getAttribute("port"));
+            } catch (Exception ex) {
+              // do nothing, port is already set to any port
+            }
+            String realm = scopeElement.getAttribute("realm");
+            String scheme = scopeElement.getAttribute("scheme");
+
+            // Set credentials for the determined scope
+            AuthScope authScope = getAuthScope(host, port, realm, scheme);
+            NTCredentials credentials = new NTCredentials(username, password,
+                agentHost, realm);
+
+            client.getState().setCredentials(authScope, credentials);
+
+            if (LOG.isTraceEnabled()) {
+              LOG.trace("Credentials - username: " + username
+                  + "; set for AuthScope - " + "host: " + host + "; port: "
+                  + port + "; realm: " + realm + "; scheme: " + scheme);
+            }
+
+          } else {
+            if (LOG.isWarnEnabled())
+              LOG.warn("Bad auth conf file: Element <"
+                  + scopeElement.getTagName() + "> not recognized in "
+                  + authFile + " - expected <authscope>");
+          }
+        }
+        is.close();
+      }
+    }
+  }
+
+  /**
+   * <auth-configuration> <credentials authMethod="formAuth" loginUrl="loginUrl"
+   * loginFormId="loginFormId" loginRedirect="true"> <loginPostData> <field
+   * name="username" value="user1"/> </loginPostData> <additionalPostHeaders>
+   * <field name="header1" value="vaule1"/> </additionalPostHeaders>
+   * <removedFormFields> <field name="header1"/> </removedFormFields>
+   * </credentials> </auth-configuration>
+   */
+  private static HttpFormAuthConfigurer readFormAuthConfigurer(
+      Element credElement, String authMethod) {
+    if ("formAuth".equals(authMethod)) {
+      HttpFormAuthConfigurer formConfigurer = new HttpFormAuthConfigurer();
+
+      String str = credElement.getAttribute("loginUrl");
+      if (StringUtils.isNotBlank(str)) {
+        formConfigurer.setLoginUrl(str.trim());
+      } else {
+        throw new IllegalArgumentException("Must set loginUrl.");
+      }
+      str = credElement.getAttribute("loginFormId");
+      if (StringUtils.isNotBlank(str)) {
+        formConfigurer.setLoginFormId(str.trim());
+      } else {
+        throw new IllegalArgumentException("Must set loginFormId.");
+      }
+      str = credElement.getAttribute("loginRedirect");
+      if (StringUtils.isNotBlank(str)) {
+        formConfigurer.setLoginRedirect(Boolean.parseBoolean(str));
+      }
+
+      NodeList nodeList = credElement.getChildNodes();
+      for (int j = 0; j < nodeList.getLength(); j++) {
+        Node node = nodeList.item(j);
+        if (!(node instanceof Element))
+          continue;
+
+        Element element = (Element) node;
+        if ("loginPostData".equals(element.getTagName())) {
+          Map<String, String> loginPostData = new HashMap<String, String>();
+          NodeList childNodes = element.getChildNodes();
+          for (int k = 0; k < childNodes.getLength(); k++) {
+            Node fieldNode = childNodes.item(k);
+            if (!(fieldNode instanceof Element))
+              continue;
+
+            Element fieldElement = (Element) fieldNode;
+            String name = fieldElement.getAttribute("name");
+            String value = fieldElement.getAttribute("value");
+            loginPostData.put(name, value);
+          }
+          formConfigurer.setLoginPostData(loginPostData);
+        } else if ("additionalPostHeaders".equals(element.getTagName())) {
+          Map<String, String> additionalPostHeaders = new HashMap<String, String>();
+          NodeList childNodes = element.getChildNodes();
+          for (int k = 0; k < childNodes.getLength(); k++) {
+            Node fieldNode = childNodes.item(k);
+            if (!(fieldNode instanceof Element))
+              continue;
+
+            Element fieldElement = (Element) fieldNode;
+            String name = fieldElement.getAttribute("name");
+            String value = fieldElement.getAttribute("value");
+            additionalPostHeaders.put(name, value);
+          }
+          formConfigurer.setAdditionalPostHeaders(additionalPostHeaders);
+        } else if ("removedFormFields".equals(element.getTagName())) {
+          Set<String> removedFormFields = new HashSet<String>();
+          NodeList childNodes = element.getChildNodes();
+          for (int k = 0; k < childNodes.getLength(); k++) {
+            Node fieldNode = childNodes.item(k);
+            if (!(fieldNode instanceof Element))
+              continue;
+
+            Element fieldElement = (Element) fieldNode;
+            String name = fieldElement.getAttribute("name");
+            removedFormFields.add(name);
+          }
+          formConfigurer.setRemovedFormFields(removedFormFields);
+        }
+      }
+
+      return formConfigurer;
+    } else {
+      throw new IllegalArgumentException("Unsupported authMethod: "
+          + authMethod);
+    }
+  }
+
+  /**
+   * If credentials for the authentication scope determined from the specified
+   * <code>url</code> is not already set in the HTTP client, then this method
+   * sets the default credentials to fetch the specified <code>url</code>. If
+   * credentials are found for the authentication scope, the method returns
+   * without altering the client.
+   * 
+   * @param url
+   *          URL to be fetched
+   */
+  private void resolveCredentials(URL url) {
+
+    if (formConfigurer != null) {
+      HttpFormAuthentication formAuther = new HttpFormAuthentication(
+          formConfigurer, client, this);
+      try {
+        formAuther.login();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+
+      return;
+    }
+
+    if (defaultUsername != null && defaultUsername.length() > 0) {
+
+      int port = url.getPort();
+      if (port == -1) {
+        if ("https".equals(url.getProtocol()))
+          port = 443;
+        else
+          port = 80;
+      }
+
+      AuthScope scope = new AuthScope(url.getHost(), port);
+
+      if (client.getState().getCredentials(scope) != null) {
+        if (LOG.isTraceEnabled())
+          LOG.trace("Pre-configured credentials with scope - host: "
+              + url.getHost() + "; port: " + port + "; found for url: " + url);
+
+        // Credentials are already configured, so do nothing and return
+        return;
+      }
+
+      if (LOG.isTraceEnabled())
+        LOG.trace("Pre-configured credentials with scope -  host: "
+            + url.getHost() + "; port: " + port + "; not found for url: " + url);
+
+      AuthScope serverAuthScope = getAuthScope(url.getHost(), port,
+          defaultRealm, defaultScheme);
+
+      NTCredentials serverCredentials = new NTCredentials(defaultUsername,
+          defaultPassword, agentHost, defaultRealm);
+
+      client.getState().setCredentials(serverAuthScope, serverCredentials);
+    }
+  }
+
+  /**
+   * Returns an authentication scope for the specified <code>host</code>,
+   * <code>port</code>, <code>realm</code> and <code>scheme</code>.
+   * 
+   * @param host
+   *          Host name or address.
+   * @param port
+   *          Port number.
+   * @param realm
+   *          Authentication realm.
+   * @param scheme
+   *          Authentication scheme.
+   */
+  private static AuthScope getAuthScope(String host, int port, String realm,
+      String scheme) {
+
+    if (host.length() == 0)
+      host = null;
+
+    if (port < 0)
+      port = -1;
+
+    if (realm.length() == 0)
+      realm = null;
+
+    if (scheme.length() == 0)
+      scheme = null;
+
+    return new AuthScope(host, port, realm, scheme);
+  }
+
  /**
   * Returns an authentication scope for the specified <code>host</code>,
   * <code>port</code> and <code>realm</code>.
   * 
   * @param host
   *          Host name or address.
   * @param port
   *          Port number.
   * @param realm
   *          Authentication realm.
   * @return an authentication scope that matches any scheme (delegates to the
   *         four-argument overload with an empty scheme string).
   */
  private static AuthScope getAuthScope(String host, int port, String realm) {

    return getAuthScope(host, port, realm, "");
  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
new file mode 100644
index 0000000..54dc905
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.util.List;
+
/**
 * The base level of services required for Http Authentication
 * 
 * @see HttpAuthenticationFactory
 * 
 * @author Matt Tencati
 */
public interface HttpAuthentication {

  /**
   * Gets the credentials generated by the HttpAuthentication object. May
   * return null.
   *
   * @return The credentials value
   */
  List<String> getCredentials();

  /**
   * Gets the realm used by the HttpAuthentication object during creation.
   *
   * @return The realm value
   */
  String getRealm();

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
new file mode 100644
index 0000000..daff5ec
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
/**
 * Can be used to identify problems during creation of Authentication objects.
 * In the future it may be used as a method of collecting authentication
 * failures during Http protocol transfer in order to present the user with
 * credentials required during a future fetch.
 * 
 * @author Matt Tencati
 */
public class HttpAuthenticationException extends Exception {

  // Exception is Serializable; declare an explicit version id so the
  // serialized form is stable across compiler versions.
  private static final long serialVersionUID = 1L;

  /**
   * Constructs a new exception with null as its detail message.
   */
  public HttpAuthenticationException() {
    super();
  }

  /**
   * Constructs a new exception with the specified detail message.
   * 
   * @param message
   *          the detail message. The detail message is saved for later
   *          retrieval by the {@link Throwable#getMessage()} method.
   */
  public HttpAuthenticationException(String message) {
    super(message);
  }

  /**
   * Constructs a new exception with the specified message and cause.
   * 
   * @param message
   *          the detail message. The detail message is saved for later
   *          retrieval by the {@link Throwable#getMessage()} method.
   * @param cause
   *          the cause (use {@link #getCause()} to retrieve the cause)
   */
  public HttpAuthenticationException(String message, Throwable cause) {
    super(message, cause);
  }

  /**
   * Constructs a new exception with the specified cause and detail message
   * from the given cause if it is not null.
   * 
   * @param cause
   *          the cause (use {@link #getCause()} to retrieve the cause)
   */
  public HttpAuthenticationException(Throwable cause) {
    super(cause);
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
new file mode 100644
index 0000000..064a6d0
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.Collection;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Provides the Http protocol implementation with the ability to authenticate
+ * when prompted. The goal is to provide multiple authentication types but for
+ * now just the {@link HttpBasicAuthentication} authentication type is provided.
+ * 
+ * @see HttpBasicAuthentication
+ * @see Http
+ * @see HttpResponse
+ * 
+ * @author Matt Tencati
+ */
+public class HttpAuthenticationFactory implements Configurable {
+
+  /**
+   * The HTTP Authentication (WWW-Authenticate) header which is returned by a
+   * webserver requiring authentication.
+   */
+  public static final String WWW_AUTHENTICATE = "WWW-Authenticate";
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HttpAuthenticationFactory.class);
+
+  private Configuration conf = null;
+
+  public HttpAuthenticationFactory(Configuration conf) {
+    setConf(conf);
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public HttpAuthentication findAuthentication(Metadata header) {
+
+    if (header == null)
+      return null;
+
+    try {
+      Collection<String> challenge = new ArrayList<String>();
+      challenge.add(header.get(WWW_AUTHENTICATE));
+
+      for (String challengeString : challenge) {
+        if (challengeString.equals("NTLM"))
+          challengeString = "Basic realm=techweb";
+
+        if (LOG.isTraceEnabled())
+          LOG.trace("Checking challengeString=" + challengeString);
+
+        HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(
+            challengeString, conf);
+        if (auth != null)
+          return auth;
+
+        // TODO Add additional Authentication lookups here
+      }
+    } catch (Exception e) {
+      LOG.error("Error: ", e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
new file mode 100644
index 0000000..0cc2de5
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+// Commons Codec imports
+import org.apache.commons.codec.binary.Base64;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * Implementation of RFC 2617 Basic Authentication. Usernames and passwords are
+ * stored in standard Nutch configuration files using the following properties:
+ * http.auth.basic.<realm>.user http.auth.basic.<realm>.pass
+ * 
+ * @author Matt Tencati
+ */
+public class HttpBasicAuthentication implements HttpAuthentication,
+    Configurable {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HttpBasicAuthentication.class);
+
+  private static Pattern basic = Pattern
+      .compile("[bB][aA][sS][iI][cC] [rR][eE][aA][lL][mM]=\"(\\w*)\"");
+
+  private static Map<String, HttpBasicAuthentication> authMap = new TreeMap<String, HttpBasicAuthentication>();
+
+  private Configuration conf = null;
+  private String challenge = null;
+  private ArrayList<String> credentials = null;
+  private String realm = null;
+
+  /**
+   * Construct an HttpBasicAuthentication for the given challenge parameters.
+   * The challenge parameters are returned by the web server using a
+   * WWW-Authenticate header. This will typically be represented by single line
+   * of the form <code>WWW-Authenticate: Basic realm="myrealm"</code>
+   * 
+   * @param challenge
+   *          WWW-Authenticate header from web server
+   */
+  protected HttpBasicAuthentication(String challenge, Configuration conf)
+      throws HttpAuthenticationException {
+
+    setConf(conf);
+    this.challenge = challenge;
+    credentials = new ArrayList<String>();
+
+    String username = this.conf.get("http.auth.basic." + challenge + ".user");
+    String password = this.conf.get("http.auth.basic." + challenge
+        + ".password");
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("BasicAuthentication challenge is " + challenge);
+      LOG.trace("BasicAuthentication username=" + username);
+      LOG.trace("BasicAuthentication password=" + password);
+    }
+
+    if (username == null) {
+      throw new HttpAuthenticationException("Username for " + challenge
+          + " is null");
+    }
+
+    if (password == null) {
+      throw new HttpAuthenticationException("Password for " + challenge
+          + " is null");
+    }
+
+    byte[] credBytes = (username + ":" + password).getBytes();
+    credentials.add("Authorization: Basic "
+        + new String(Base64.encodeBase64(credBytes)));
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Basic credentials: " + credentials);
+    }
+  }
+
+  /*
+   * ---------------------------------- * <implementation:Configurable> *
+   * ----------------------------------
+   */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    // if (conf.getBoolean("http.auth.verbose", false)) {
+    // LOG.setLevel(Level.FINE);
+    // } else {
+    // LOG.setLevel(Level.WARNING);
+    // }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /*
+   * ---------------------------------- * <implementation:Configurable> *
+   * ----------------------------------
+   */
+
+  /**
+   * Gets the Basic credentials generated by this HttpBasicAuthentication object
+   * 
+   * @return Credentials in the form of
+   *         <code>Authorization: Basic &lt;Base64 encoded userid:password&gt;
+   * 
+   */
+  public List<String> getCredentials() {
+    return credentials;
+  }
+
+  /**
+   * Gets the realm attribute of the HttpBasicAuthentication object. This should
+   * have been supplied to the {@link #getAuthentication(String, Configuration)}
+   * static method
+   * 
+   * @return The realm
+   */
+  public String getRealm() {
+    return realm;
+  }
+
+  /**
+   * This method is responsible for providing Basic authentication information.
+   * The method caches authentication information for each realm so that the
+   * required authentication information does not need to be regenerated for
+   * every request.
+   * 
+   * @param challenge
+   *          The challenge string provided by the webserver. This is the text
+   *          which follows the WWW-Authenticate header, including the Basic
+   *          tag.
+   * @return An HttpBasicAuthentication object or null if unable to generate
+   *         appropriate credentials.
+   */
+  public static HttpBasicAuthentication getAuthentication(String challenge,
+      Configuration conf) {
+    if (challenge == null)
+      return null;
+    Matcher basicMatcher = basic.matcher(challenge);
+    if (basicMatcher.matches()) {
+      String realm = basicMatcher.group(1);
+      Object auth = authMap.get(realm);
+      if (auth == null) {
+        HttpBasicAuthentication newAuth = null;
+        try {
+          newAuth = new HttpBasicAuthentication(realm, conf);
+        } catch (HttpAuthenticationException hae) {
+          if (LOG.isTraceEnabled()) {
+            LOG.trace("HttpBasicAuthentication failed for " + challenge);
+          }
+        }
+        authMap.put(realm, newAuth);
+        return newAuth;
+      } else {
+        return (HttpBasicAuthentication) auth;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Provides a pattern which can be used by an outside resource to determine if
+   * this class can provide credentials based on simple header information. It
+   * does not calculate any information regarding realms or challenges.
+   * 
+   * @return Returns a Pattern which will match a Basic WWW-Authenticate header.
+   */
+  public static final Pattern getBasicPattern() {
+    return basic;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
new file mode 100644
index 0000000..b713ab6
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
/**
 * Holds the settings needed to perform a form-based (HTTP POST) login: the
 * login page URL, the id of the login form, the fields to post, any extra
 * request headers, and form fields that should be stripped before posting.
 * All setters return {@code this} so calls can be chained builder-style.
 */
public class HttpFormAuthConfigurer {
  // URL of the page containing the login form.
  private String loginUrl;
  // DOM id of the login form on that page.
  private String loginFormId;
  // Data posted to the login form, such as username (or email) and password.
  private Map<String, String> loginPostData;
  // Additional HTTP headers to send with the login POST, if any.
  private Map<String, String> additionalPostHeaders;
  // When the login POST answers 301/302, the HTTP client automatically
  // follows the redirect if this flag is set.
  private boolean loginRedirect;
  // Names of form fields to remove before posting.
  private Set<String> removedFormFields;

  public HttpFormAuthConfigurer() {
  }

  public String getLoginUrl() {
    return loginUrl;
  }

  public HttpFormAuthConfigurer setLoginUrl(String loginUrl) {
    this.loginUrl = loginUrl;
    return this;
  }

  public String getLoginFormId() {
    return loginFormId;
  }

  public HttpFormAuthConfigurer setLoginFormId(String loginForm) {
    this.loginFormId = loginForm;
    return this;
  }

  /** Returns the post data, or a fresh empty map when none was configured. */
  public Map<String, String> getLoginPostData() {
    if (loginPostData == null) {
      return new HashMap<String, String>();
    }
    return loginPostData;
  }

  public HttpFormAuthConfigurer setLoginPostData(
      Map<String, String> loginPostData) {
    this.loginPostData = loginPostData;
    return this;
  }

  /** Returns the extra headers, or a fresh empty map when none were set. */
  public Map<String, String> getAdditionalPostHeaders() {
    if (additionalPostHeaders == null) {
      return new HashMap<String, String>();
    }
    return additionalPostHeaders;
  }

  public HttpFormAuthConfigurer setAdditionalPostHeaders(
      Map<String, String> additionalPostHeaders) {
    this.additionalPostHeaders = additionalPostHeaders;
    return this;
  }

  public boolean isLoginRedirect() {
    return loginRedirect;
  }

  public HttpFormAuthConfigurer setLoginRedirect(boolean redirect) {
    this.loginRedirect = redirect;
    return this;
  }

  /** Returns the removed-field names, or a fresh empty set when none set. */
  public Set<String> getRemovedFormFields() {
    if (removedFormFields == null) {
      return new HashSet<String>();
    }
    return removedFormFields;
  }

  public HttpFormAuthConfigurer setRemovedFormFields(
      Set<String> removedFormFields) {
    this.removedFormFields = removedFormFields;
    return this;
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
new file mode 100644
index 0000000..4c73f50
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
@@ -0,0 +1,223 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.NameValuePair;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.methods.PostMethod;
+import org.apache.commons.io.IOUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Implements HTTP POST form-based authentication: the login page is fetched
+ * with a GET, its form fields are scraped with jsoup, and the configured
+ * credentials are POSTed back to the login URL so that the shared
+ * {@link HttpClient} holds an authenticated session.
+ */
+public class HttpFormAuthentication {
+  private static final Logger LOGGER = LoggerFactory
+      .getLogger(HttpFormAuthentication.class);
+  // Baseline request headers for the login POST.
+  // NOTE(review): this map is static but is mutated by the
+  // (HttpFormAuthConfigurer, HttpClient, Http) constructor below, so the
+  // Accept/Accept-Language/User-Agent of the most recently constructed
+  // instance leak into all instances -- confirm only one instance is used.
+  private static Map<String, String> defaultLoginHeaders = new HashMap<String, String>();
+
+  static {
+    defaultLoginHeaders.put("User-Agent", "Mozilla/5.0");
+    defaultLoginHeaders
+    .put("Accept",
+        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+    defaultLoginHeaders.put("Accept-Language", "en-US,en;q=0.5");
+    defaultLoginHeaders.put("Connection", "keep-alive");
+    defaultLoginHeaders.put("Content-Type",
+        "application/x-www-form-urlencoded");
+  }
+
+  private HttpClient client;
+  private HttpFormAuthConfigurer authConfigurer = new HttpFormAuthConfigurer();
+  // Raw "Set-Cookie" header value captured while GETting the login page;
+  // replayed verbatim on the login POST.
+  private String cookies;
+
+  /**
+   * Creates an authenticator that reuses the plugin's HTTP client and takes
+   * the Accept, Accept-Language and User-Agent header values from the
+   * plugin configuration.
+   */
+  public HttpFormAuthentication(HttpFormAuthConfigurer authConfigurer,
+      HttpClient client, Http http) {
+    this.authConfigurer = authConfigurer;
+    this.client = client;
+    defaultLoginHeaders.put("Accept", http.getAccept());
+    defaultLoginHeaders.put("Accept-Language", http.getAcceptLanguage());
+    defaultLoginHeaders.put("User-Agent", http.getUserAgent());
+  }
+
+  /**
+   * Convenience constructor using its own {@link HttpClient}; {@code null}
+   * collections are replaced by empty ones.
+   */
+  public HttpFormAuthentication(String loginUrl, String loginForm,
+      Map<String, String> loginPostData,
+      Map<String, String> additionalPostHeaders,
+      Set<String> removedFormFields) {
+    this.authConfigurer.setLoginUrl(loginUrl);
+    this.authConfigurer.setLoginFormId(loginForm);
+    this.authConfigurer
+    .setLoginPostData(loginPostData == null ? new HashMap<String, String>()
+        : loginPostData);
+    this.authConfigurer
+    .setAdditionalPostHeaders(additionalPostHeaders == null ? new HashMap<String, String>()
+        : additionalPostHeaders);
+    this.authConfigurer
+    .setRemovedFormFields(removedFormFields == null ? new HashSet<String>()
+        : removedFormFields);
+    this.client = new HttpClient();
+  }
+
+  /**
+   * Performs the login: fetches the login page, extracts the form
+   * parameters, and POSTs them (plus the configured credentials) back to
+   * the login URL.
+   *
+   * @throws Exception
+   *           if the login form cannot be found or an HTTP call fails
+   */
+  public void login() throws Exception {
+    // make sure cookies are turned on
+    CookieHandler.setDefault(new CookieManager());
+    String pageContent = httpGetPageContent(authConfigurer.getLoginUrl());
+    List<NameValuePair> params = getLoginFormParams(pageContent);
+    sendPost(authConfigurer.getLoginUrl(), params);
+  }
+
+  // POSTs the collected form parameters to the login URL, optionally
+  // following redirects via an anonymous PostMethod subclass (see comment
+  // below on why setFollowRedirects cannot be used).
+  private void sendPost(String url, List<NameValuePair> params)
+      throws Exception {
+    PostMethod post = null;
+    try {
+      if (authConfigurer.isLoginRedirect()) {
+        post = new PostMethod(url) {
+          @Override
+          public boolean getFollowRedirects() {
+            return true;
+          }
+        };
+      } else {
+        post = new PostMethod(url);
+      }
+      // we can't use post.setFollowRedirects(true) as it will throw
+      // IllegalArgumentException:
+      // Entity enclosing requests cannot be redirected without user
+      // intervention
+      setLoginHeader(post);
+      post.addParameters(params.toArray(new NameValuePair[0]));
+      int rspCode = client.executeMethod(post);
+      if (LOGGER.isDebugEnabled()) {
+        LOGGER.debug("rspCode: " + rspCode);
+        LOGGER.debug("\nSending 'POST' request to URL : " + url);
+
+        LOGGER.debug("Post parameters : " + params);
+        LOGGER.debug("Response Code : " + rspCode);
+        for (Header header : post.getRequestHeaders()) {
+          LOGGER.debug("Response headers : " + header);
+        }
+      }
+      String rst = IOUtils.toString(post.getResponseBodyAsStream());
+      LOGGER.debug("login post result: " + rst);
+    } finally {
+      if (post != null) {
+        post.releaseConnection();
+      }
+    }
+  }
+
+  // Builds the header set for the login POST: defaults first, then the
+  // configured additional headers, then the cookies captured from the
+  // login-page GET.
+  private void setLoginHeader(PostMethod post) {
+    Map<String, String> headers = new HashMap<String, String>();
+    headers.putAll(defaultLoginHeaders);
+    // additionalPostHeaders can overwrite value in defaultLoginHeaders
+    headers.putAll(authConfigurer.getAdditionalPostHeaders());
+    for (Entry<String, String> entry : headers.entrySet()) {
+      post.addRequestHeader(entry.getKey(), entry.getValue());
+    }
+    post.addRequestHeader("Cookie", getCookies());
+  }
+
+  // GETs the given URL and remembers its "Set-Cookie" response header so it
+  // can be replayed on the login POST.
+  private String httpGetPageContent(String url) throws IOException {
+
+    GetMethod get = new GetMethod(url);
+    try {
+      for (Entry<String, String> entry : authConfigurer
+          .getAdditionalPostHeaders().entrySet()) {
+        get.addRequestHeader(entry.getKey(), entry.getValue());
+      }
+      client.executeMethod(get);
+      Header cookieHeader = get.getResponseHeader("Set-Cookie");
+      if (cookieHeader != null) {
+        setCookies(cookieHeader.getValue());
+      }
+      String rst = IOUtils.toString(get.getResponseBodyAsStream());
+      return rst;
+    } finally {
+      get.releaseConnection();
+    }
+
+  }
+
+  /**
+   * Extracts the input fields of the login form from the page content. The
+   * form is looked up by element id first, then by name; if neither is
+   * found an {@link IllegalArgumentException} is thrown. Fields listed in
+   * removedFormFields are dropped, and fields present in loginPostData are
+   * submitted with the configured value instead of the scraped one.
+   */
+  private List<NameValuePair> getLoginFormParams(String pageContent)
+      throws UnsupportedEncodingException {
+    List<NameValuePair> params = new ArrayList<NameValuePair>();
+    Document doc = Jsoup.parse(pageContent);
+    Element loginform = doc.getElementById(authConfigurer.getLoginFormId());
+    if (loginform == null) {
+      LOGGER.debug("No form element found with 'id' = {}, trying 'name'.",
+          authConfigurer.getLoginFormId());
+      loginform = doc.select("form[name="+ authConfigurer.getLoginFormId() + "]").first();
+      if (loginform == null) {
+        LOGGER.debug("No form element found with 'name' = {}",
+            authConfigurer.getLoginFormId());
+        throw new IllegalArgumentException("No form exists: "
+            + authConfigurer.getLoginFormId());
+      }
+    }
+    Elements inputElements = loginform.getElementsByTag("input");
+    // skip fields in removedFormFields or loginPostData
+    for (Element inputElement : inputElements) {
+      String key = inputElement.attr("name");
+      String value = inputElement.attr("value");
+      if (authConfigurer.getLoginPostData().containsKey(key)
+          || authConfigurer.getRemovedFormFields().contains(key)) {
+        // value = loginPostData.get(key);
+        continue;
+      }
+      params.add(new NameValuePair(key, value));
+    }
+    // add key and value in loginPostData
+    for (Entry<String, String> entry : authConfigurer.getLoginPostData()
+        .entrySet()) {
+      params.add(new NameValuePair(entry.getKey(), entry.getValue()));
+    }
+    return params;
+  }
+
+  /** Returns the raw "Set-Cookie" value captured from the login page GET. */
+  public String getCookies() {
+    return cookies;
+  }
+
+  /** Stores the raw cookie header value to replay on the login POST. */
+  public void setCookies(String cookies) {
+    this.cookies = cookies;
+  }
+
+  /** Whether the login POST follows redirects (delegates to configurer). */
+  public boolean isRedirect() {
+    return authConfigurer.isLoginRedirect();
+  }
+
+  /** Sets redirect-following for the login POST (delegates to configurer). */
+  public void setRedirect(boolean redirect) {
+    this.authConfigurer.setLoginRedirect(redirect);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
new file mode 100644
index 0000000..f074af2
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.httpclient;
+
+// JDK imports
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+// HTTP Client imports
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpVersion;
+import org.apache.commons.httpclient.cookie.CookiePolicy;
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.params.HttpMethodParams;
+import org.apache.commons.httpclient.HttpException;
+import org.apache.commons.httpclient.HttpClient;
+
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+
+/**
+ * An HTTP response.
+ * 
+ * @author Susam Pal
+ */
+public class HttpResponse implements Response {
+
+  // URL that was fetched.
+  private URL url;
+  // Response body, possibly truncated to http.getMaxContent() and
+  // decompressed if the server sent gzip/x-gzip/deflate encoding.
+  private byte[] content;
+  // HTTP status code of the response.
+  private int code;
+  // Response headers, stored in a spell-checked metadata container.
+  private Metadata headers = new SpellCheckedMetadata();
+
+  /**
+   * Fetches the given <code>url</code> and prepares HTTP response.
+   * 
+   * @param http
+   *          An instance of the implementation class of this plugin
+   * @param url
+   *          URL to be fetched
+   * @param datum
+   *          Crawl data
+   * @param followRedirects
+   *          Whether to follow redirects; follows redirect if and only if this
+   *          is true
+   * @return HTTP response
+   * @throws IOException
+   *           When an error occurs
+   */
+  HttpResponse(Http http, URL url, CrawlDatum datum, boolean followRedirects)
+      throws IOException {
+
+    // Prepare GET method for HTTP request
+    this.url = url;
+    GetMethod get = new GetMethod(url.toString());
+    get.setFollowRedirects(followRedirects);
+    get.setDoAuthentication(true);
+    // Conditional GET: only re-fetch if the page changed since last crawl.
+    if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
+      get.setRequestHeader("If-Modified-Since",
+          HttpDateFormat.toString(datum.getModifiedTime()));
+    }
+
+    // Set HTTP parameters
+    HttpMethodParams params = get.getParams();
+    if (http.getUseHttp11()) {
+      params.setVersion(HttpVersion.HTTP_1_1);
+    } else {
+      params.setVersion(HttpVersion.HTTP_1_0);
+    }
+    params.makeLenient();
+    params.setContentCharset("UTF-8");
+    params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
+    params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
+    // XXX (ab) not sure about this... the default is to retry 3 times; if
+    // XXX the request body was sent the method is not retried, so there is
+    // XXX little danger in retrying...
+    // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
+    try {
+      HttpClient client = Http.getClient();
+      client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
+      code = client.executeMethod(get);
+
+      // Copy all response headers into the metadata container.
+      Header[] heads = get.getResponseHeaders();
+
+      for (int i = 0; i < heads.length; i++) {
+        headers.set(heads[i].getName(), heads[i].getValue());
+      }
+
+      // Limit download size: use the declared Content-Length when present,
+      // capped by the configured http.content.limit.
+      int contentLength = Integer.MAX_VALUE;
+      String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+      if (contentLengthString != null) {
+        try {
+          contentLength = Integer.parseInt(contentLengthString.trim());
+        } catch (NumberFormatException ex) {
+          throw new HttpException("bad content length: " + contentLengthString);
+        }
+      }
+      if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+        contentLength = http.getMaxContent();
+      }
+
+      // always read content. Sometimes content is useful to find a cause
+      // for error.
+      InputStream in = get.getResponseBodyAsStream();
+      try {
+        byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+        int bufferFilled = 0;
+        int totalRead = 0;
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        // The loop stops BEFORE a chunk that would cross contentLength, so
+        // the final partial buffer is dropped and slightly fewer than
+        // contentLength bytes may be kept.
+        while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
+            && totalRead + bufferFilled <= contentLength) {
+          totalRead += bufferFilled;
+          out.write(buffer, 0, bufferFilled);
+        }
+
+        content = out.toByteArray();
+      } catch (Exception e) {
+        if (code == 200)
+          throw new IOException(e.toString());
+        // for codes other than 200 OK, we are fine with empty content
+      } finally {
+        if (in != null) {
+          in.close();
+        }
+        // abort the method so any unread remainder of the body is discarded
+        get.abort();
+      }
+
+      StringBuilder fetchTrace = null;
+      if (Http.LOG.isTraceEnabled()) {
+        // Trace message
+        fetchTrace = new StringBuilder("url: " + url + "; status code: " + code
+            + "; bytes received: " + content.length);
+        if (getHeader(Response.CONTENT_LENGTH) != null)
+          fetchTrace.append("; Content-Length: "
+              + getHeader(Response.CONTENT_LENGTH));
+        if (getHeader(Response.LOCATION) != null)
+          fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
+      }
+      // Extract gzip, x-gzip and deflate content
+      if (content != null) {
+        // check if we have to uncompress it
+        String contentEncoding = headers.get(Response.CONTENT_ENCODING);
+        if (contentEncoding != null && Http.LOG.isTraceEnabled())
+          fetchTrace.append("; Content-Encoding: " + contentEncoding);
+        if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
+          content = http.processGzipEncoded(content, url);
+          if (Http.LOG.isTraceEnabled())
+            fetchTrace.append("; extracted to " + content.length + " bytes");
+        } else if ("deflate".equals(contentEncoding)) {
+          content = http.processDeflateEncoded(content, url);
+          if (Http.LOG.isTraceEnabled())
+            fetchTrace.append("; extracted to " + content.length + " bytes");
+        }
+      }
+
+      // Logger trace message
+      if (Http.LOG.isTraceEnabled()) {
+        Http.LOG.trace(fetchTrace.toString());
+      }
+    } finally {
+      get.releaseConnection();
+    }
+  }
+
+  /*
+   * ------------------------- * <implementation:Response> *
+   * -------------------------
+   */
+
+  /** Returns the URL that was fetched. */
+  public URL getUrl() {
+    return url;
+  }
+
+  /** Returns the HTTP status code. */
+  public int getCode() {
+    return code;
+  }
+
+  /** Returns the value of the named response header, or null if absent. */
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  /** Returns all response headers. */
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  /** Returns the (possibly truncated, decompressed) response body. */
+  public byte[] getContent() {
+    return content;
+  }
+
+  /*
+   * -------------------------- * </implementation:Response> *
+   * --------------------------
+   */
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/package.html b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/package.html
new file mode 100644
index 0000000..9cbcb14
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/main/java/org/apache/nutch/protocol/httpclient/package.html
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the HTTP and
+HTTPS protocols, optionally with Basic, Digest and NTLM authentication
+schemes for web server as well as proxy server. It handles cookies
+within a single fetch operation. This plugin is based on Jakarta
+Commons HttpClient library.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/test/conf/httpclient-auth-test.xml b/nutch-plugins/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
new file mode 100644
index 0000000..3c0203b
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/test/conf/httpclient-auth-test.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<auth-configuration>
+
+  <!-- Default credentials -->
+  <credentials username="userx" password="passx">
+    <default/>
+    <authscope host="127.0.0.1" port="47500"/>
+  </credentials>
+
+  <!-- Defined a realm for 127.0.0.1:47501 so that authentication for
+       other realms fail (except another realm for 127.0.0.1:47501 is
+       defined below for NTLM scheme). -->
+  <credentials username="userx" password="passx">
+    <authscope host="127.0.0.1" port="47501" realm="realmx"
+    scheme="BASIC"/>
+  </credentials>
+
+  <!-- Test case for NTLM authentication scheme. -->
+  <credentials username="ntlm_user" password="ntlm_pass">
+    <authscope host="127.0.0.1" port="47501" realm="NUTCH"
+    scheme="NTLM"/>
+  </credentials>
+
+  <!-- Test case for credentials selection based on scheme (realm1 is
+       present in basic.jsp as well as digest.jsp).
+       Also tests Digest authentication scheme. -->
+  <credentials username="digest_user" password="digest_pass">
+    <authscope host="127.0.0.1" port="47500" realm="realm1"
+    scheme="DIGEST"/>
+  </credentials>
+
+  <!-- Test case for Basic authentication scheme. -->
+  <credentials username="user1" password="pass1">
+    <authscope host="127.0.0.1" port="47500" realm="realm1"/>
+  </credentials>
+  <credentials username="user2" password="pass2">
+    <authscope host="127.0.0.1" port="47500" realm="realm2"/>
+  </credentials>
+
+</auth-configuration>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/test/conf/nutch-site-test.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/test/conf/nutch-site-test.xml b/nutch-plugins/protocol-httpclient/src/test/conf/nutch-site-test.xml
new file mode 100644
index 0000000..856ea15
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/test/conf/nutch-site-test.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<configuration>
+
+<property>
+  <name>http.robots.agents</name>
+  <value>Nutch-Test,*</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.name</name>
+  <value>Nutch-Test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.agent.description</name>
+  <value>Nutch protocol-httpclient test</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.auth.file</name>
+  <value>httpclient-auth-test.xml</value>
+  <description></description>
+</property>
+
+<property>
+  <name>http.timeout</name>
+  <value>60000</value>
+  <description></description>
+</property>
+
+</configuration>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java b/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
new file mode 100644
index 0000000..783e5af
--- /dev/null
+++ b/nutch-plugins/protocol-httpclient/src/test/java/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.httpclient;
+
+import java.net.URL;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.bio.SocketConnector;
+import org.mortbay.jetty.handler.ContextHandler;
+import org.mortbay.jetty.servlet.ServletHandler;
+import org.mortbay.jetty.servlet.SessionHandler;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+
+/**
+ * Test cases for protocol-httpclient.
+ */
+public class TestProtocolHttpClient {
+
+  // Embedded Jetty server serving the JSP test pages.
+  private Server server;
+  private Configuration conf;
+  private static final String RES_DIR = System.getProperty("test.data", ".");
+  // Port the Jetty test server is actually listening on.
+  private int port;
+  private Http http = new Http();
+
+  /**
+   * Sets up an embedded Jetty server serving the JSP test resources and a
+   * protocol-httpclient instance configured from nutch-site-test.xml.
+   */
+  @Before
+  public void setUp() throws Exception {
+
+    ContextHandler context = new ContextHandler();
+    context.setContextPath("/");
+    context.setResourceBase(RES_DIR);
+    ServletHandler sh = new ServletHandler();
+    sh.addServletWithMapping("org.apache.jasper.servlet.JspServlet", "*.jsp");
+    context.addHandler(sh);
+    context.addHandler(new SessionHandler());
+
+    server = new Server();
+    server.addHandler(context);
+
+    conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site-test.xml");
+
+    http = new Http();
+    http.setConf(conf);
+  }
+
+  /** Stops the Jetty server, waiting up to five seconds for shutdown. */
+  @After
+  public void tearDown() throws Exception {
+    server.stop();
+    for (int i = 0; i < 5; i++) {
+      if (!server.isStopped()) {
+        Thread.sleep(1000);
+      }
+    }
+  }
+
+  /**
+   * Tests whether the client can remember cookies.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testCookies() throws Exception {
+    startServer(47500);
+    fetchPage("/cookies.jsp", 200);
+    fetchPage("/cookies.jsp?cookie=yes", 200);
+  }
+
+  /**
+   * Tests that no pre-emptive authorization headers are sent by the client.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testNoPreemptiveAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/noauth.jsp", 200);
+  }
+
+  /**
+   * Tests default credentials.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testDefaultCredentials() throws Exception {
+    startServer(47502);
+    fetchPage("/basic.jsp", 200);
+  }
+
+  /**
+   * Tests basic authentication scheme for various realms.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testBasicAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/basic.jsp", 200);
+    fetchPage("/basic.jsp?case=1", 200);
+    fetchPage("/basic.jsp?case=2", 200);
+    // Removed a stray trailing server.start(): the server is already
+    // running (startServer), so the duplicate call was dead code.
+  }
+
+  /**
+   * Tests that authentication happens for a defined realm and not for other
+   * realms for a host:port when an extra <code>authscope</code> tag is not
+   * defined to match all other realms.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testOtherRealmsNoAuth() throws Exception {
+    startServer(47501);
+    fetchPage("/basic.jsp", 200);
+    fetchPage("/basic.jsp?case=1", 401);
+    fetchPage("/basic.jsp?case=2", 401);
+  }
+
+  /**
+   * Tests Digest authentication scheme.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testDigestAuth() throws Exception {
+    startServer(47500);
+    fetchPage("/digest.jsp", 200);
+  }
+
+  /**
+   * Tests NTLM authentication scheme.
+   * 
+   * @throws Exception
+   *           If an error occurs or the test case fails.
+   */
+  @Test
+  public void testNtlmAuth() throws Exception {
+    startServer(47501);
+    fetchPage("/ntlm.jsp", 200);
+  }
+
+  /**
+   * Starts the Jetty server at a specified port.
+   *
+   * Will try up to 10 consecutive ports, starting at <code>portno</code>,
+   * until one is available.
+   *
+   * @param portno
+   *          First port number to try.
+   * @throws Exception
+   *           When no port in the range could be bound.
+   */
+  private void startServer(int portno) throws Exception {
+    SocketConnector listener = new SocketConnector();
+    listener.setHost("127.0.0.1");
+    server.addConnector(listener);
+    for (int p = portno; p < portno + 10; p++) {
+      // Bug fix: use the loop variable, not the initial port number.
+      // Previously "port = portno" made every retry bind the same port,
+      // defeating the retry loop documented above.
+      port = p;
+      listener.setPort(port);
+      try {
+        server.start();
+        break;
+      } catch (Exception e) {
+        // Last attempt failed: propagate the bind error.
+        if (p == portno + 9) {
+          throw e;
+        }
+      }
+    }
+  }
+
+  /**
+   * Fetches the specified <code>page</code> from the local Jetty server and
+   * checks whether the HTTP response status code matches with the expected
+   * code.
+   * 
+   * @param page
+   *          Page to be fetched.
+   * @param expectedCode
+   *          HTTP response status code expected while fetching the page.
+   * @throws Exception
+   *           When an error occurs or test case fails.
+   */
+  private void fetchPage(String page, int expectedCode) throws Exception {
+    URL url = new URL("http", "127.0.0.1", port, page);
+    Response response = http.getResponse(url, new CrawlDatum(), true);
+
+    int code = response.getCode();
+    Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/README.md
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/README.md b/nutch-plugins/protocol-interactiveselenium/README.md
new file mode 100644
index 0000000..dd43ee7
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/README.md
@@ -0,0 +1,38 @@
+Nutch Interactive Selenium
+==========================
+
+This protocol plugin allows you to fetch and interact with pages using [Selenium](http://www.seleniumhq.org/).
+
+# Dependencies and Configuration
+
+You will need to have [Selenium](http://www.seleniumhq.org/) and a compatible version of Firefox installed to use this plugin.
+
+Set the protocol to be used in your Nutch configuration files.
+```
+<!-- NUTCH_HOME/conf/nutch-site.xml -->
+
+<configuration>
+  ...
+  <property>
+    <name>plugin.includes</name>
+    <value>protocol-interactiveselenium|urlfilter-regex| ... </value>
+    <description></description>
+  </property>
+```
+
+# Custom Handlers
+
+Only basic functionality is included in the DefaultHandler that comes with the plugin. If you want additional functionality you can implement custom handlers by implementing the InteractiveSeleniumHandler interface in the plugin package. Be sure to also update the plugin config to include your new handler.
+
+```
+<!-- NUTCH_HOME/conf/nutch-site.xml -->
+<property>
+  <name>interactiveselenium.handlers</name>
+  <value>NewCustomHandler,DefaultHandler</value>
+  <description></description>
+</property>
+```
+
+# Handler Info
+
+Handlers are called in the order that they're specified in the configuration. A "clean" driver is used for each handler so multiple handlers won't interfere with each other. Page content is appended together from each handler and returned for the request.

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/build-ivy.xml b/nutch-plugins/protocol-interactiveselenium/build-ivy.xml
new file mode 100644
index 0000000..9f96619
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-interactiveselenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/build.xml b/nutch-plugins/protocol-interactiveselenium/build.xml
new file mode 100644
index 0000000..69dab90
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-interactiveselenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-selenium/*.jar" />
+      <include name="**/protocol-selenium/*.jar" />
+    </fileset>
+  </path>
+
+</project>


[12/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/build.xml b/nutch-plugins/parse-swf/build.xml
new file mode 100644
index 0000000..f4fb20f
--- /dev/null
+++ b/nutch-plugins/parse-swf/build.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-swf" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+  </target>
+
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy file="sample/test1.swf" todir="${build.test}/data"/>
+  <copy file="sample/test2.swf" todir="${build.test}/data"/>
+  <copy file="sample/test3.swf" todir="${build.test}/data"/>
+  <copy file="sample/test1.txt" todir="${build.test}/data"/>
+  <copy file="sample/test2.txt" todir="${build.test}/data"/>
+  <copy file="sample/test3.txt" todir="${build.test}/data"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/ivy.xml b/nutch-plugins/parse-swf/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-swf/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt b/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
new file mode 100644
index 0000000..4138a66
--- /dev/null
+++ b/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
@@ -0,0 +1,33 @@
+
+  Copyright (c) 2001-2005, David N. Main, All rights reserved.
+  
+  Redistribution and use in source and binary forms, with or
+  without modification, are permitted provided that the 
+  following conditions are met:
+ 
+  1. Redistributions of source code must retain the above 
+  copyright notice, this list of conditions and the following 
+  disclaimer. 
+  
+  2. Redistributions in binary form must reproduce the above 
+  copyright notice, this list of conditions and the following 
+  disclaimer in the documentation and/or other materials 
+  provided with the distribution.
+  
+  3. The name of the author may not be used to endorse or 
+  promote products derived from this software without specific 
+  prior written permission. 
+  
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY 
+  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
+  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
+  AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 
+  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 
+  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/lib/javaswf.jar
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/lib/javaswf.jar b/nutch-plugins/parse-swf/lib/javaswf.jar
new file mode 100644
index 0000000..78f9b0b
Binary files /dev/null and b/nutch-plugins/parse-swf/lib/javaswf.jar differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/plugin.xml b/nutch-plugins/parse-swf/plugin.xml
new file mode 100644
index 0000000..8cc72c0
--- /dev/null
+++ b/nutch-plugins/parse-swf/plugin.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-swf"
+   name="SWF Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="parse-swf.jar">
+         <export name="*"/>
+      </library>
+      <library name="javaswf.jar"/>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.swf"
+              name="SWFParse"
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.swf.SWFParser"
+                      class="org.apache.nutch.parse.swf.SWFParser">
+        <parameter name="contentType" value="application/x-shockwave-flash"/>
+        <parameter name="pathSuffix"  value="swf"/>
+      </implementation>
+      
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/pom.xml b/nutch-plugins/parse-swf/pom.xml
new file mode 100644
index 0000000..743511e
--- /dev/null
+++ b/nutch-plugins/parse-swf/pom.xml
@@ -0,0 +1,46 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-swf</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-swf</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>com.google.gwt</groupId>
+            <artifactId>gwt-incubator</artifactId>
+            <version>2.0.1</version>
+        </dependency>
+
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
new file mode 100644
index 0000000..9251366
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
@@ -0,0 +1,685 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.swf;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.anotherbigidea.flash.interfaces.*;
+import com.anotherbigidea.flash.readers.*;
+import com.anotherbigidea.flash.structs.*;
+import com.anotherbigidea.flash.writers.SWFActionBlockImpl;
+import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
+import com.anotherbigidea.io.InStream;
+
+/**
+ * Parser for Flash SWF files. Loosely based on the sample in JavaSWF
+ * distribution.
+ */
+public class SWFParser implements Parser {
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.swf");
+
+  private Configuration conf = null;
+
+  public SWFParser() {
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public ParseResult getParse(Content content) {
+
+    String text = null;
+    Vector<Outlink> outlinks = new Vector<Outlink>();
+
+    try {
+
+      byte[] raw = content.getContent();
+
+      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
+      if (contentLength != null
+          && raw.length != Integer.parseInt(contentLength)) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+                + " bytes. Parser can't handle incomplete files.")
+            .getEmptyParseResult(content.getUrl(), getConf());
+      }
+      ExtractText extractor = new ExtractText();
+
+      // TagParser implements SWFTags and drives a SWFTagTypes interface
+      TagParser parser = new TagParser(extractor);
+      // use this instead to debug the file
+      // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
+
+      // SWFReader reads an input file and drives a SWFTags interface
+      SWFReader reader = new SWFReader(parser, new InStream(raw));
+
+      // read the input SWF file and pass it through the interface pipeline
+      reader.readFile();
+      text = extractor.getText();
+      String atext = extractor.getActionText();
+      if (atext != null && atext.length() > 0)
+        text += "\n--------\n" + atext;
+      // harvest potential outlinks
+      String[] links = extractor.getUrls();
+      for (int i = 0; i < links.length; i++) {
+        Outlink out = new Outlink(links[i], "");
+        outlinks.add(out);
+      }
+      Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
+      if (olinks != null)
+        for (int i = 0; i < olinks.length; i++) {
+          outlinks.add(olinks[i]);
+        }
+    } catch (Exception e) { // run time exception
+      LOG.error("Error, runtime exception: ", e);
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as SWF document. " + e).getEmptyParseResult(
+          content.getUrl(), getConf());
+    }
+    if (text == null)
+      text = "";
+
+    Outlink[] links = (Outlink[]) outlinks
+        .toArray(new Outlink[outlinks.size()]);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
+        content.getMetadata());
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+        parseData));
+  }
+
+  /**
+   * Arguments are: 0. Name of input SWF file.
+   */
+  public static void main(String[] args) throws IOException {
+    FileInputStream in = new FileInputStream(args[0]);
+
+    byte[] buf = new byte[in.available()];
+    in.read(buf);
+    in.close();
+    SWFParser parser = new SWFParser();
+    ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
+        "file:" + args[0], buf, "application/x-shockwave-flash",
+        new Metadata(), NutchConfiguration.create()));
+    Parse p = parseResult.get("file:" + args[0]);
+    System.out.println("Parse Text:");
+    System.out.println(p.getText());
+    System.out.println("Parse Data:");
+    System.out.println(p.getData());
+  }
+}
+
+/**
+ * Shows how to parse a Flash movie and extract all the text in Text symbols and
+ * the initial text in Edit Fields. Output is to System.out.
+ * 
+ * A "pipeline" is set up in the main method:
+ * 
+ * SWFReader-->TagParser-->ExtractText
+ * 
+ * SWFReader reads the input SWF file and separates out the header and the tags.
+ * The separated contents are passed to TagParser which parses out the
+ * individual tag types and passes them to ExtractText.
+ * 
+ * ExtractText extends SWFTagTypesImpl and overrides some methods.
+ */
+class ExtractText extends SWFTagTypesImpl {
+  /**
+   * Store font info keyed by the font symbol id. Each entry is an int[] of
+   * character codes for the correspnding font glyphs (An empty array denotes a
+   * System Font).
+   */
+  protected HashMap<Integer, int[]> fontCodes = new HashMap<Integer, int[]>();
+
+  public ArrayList<String> strings = new ArrayList<String>();
+
+  public HashSet<String> actionStrings = new HashSet<String>();
+
+  public ArrayList<String> urls = new ArrayList<String>();
+
+  public ExtractText() {
+    super(null);
+  }
+
+  public String getText() {
+    StringBuffer res = new StringBuffer();
+    Iterator<String> it = strings.iterator();
+    while (it.hasNext()) {
+      if (res.length() > 0)
+        res.append(' ');
+      res.append(it.next());
+    }
+    return res.toString();
+  }
+
+  public String getActionText() {
+    StringBuffer res = new StringBuffer();
+    String[] strings = (String[]) actionStrings
+        .toArray(new String[actionStrings.size()]);
+    Arrays.sort(strings);
+    for (int i = 0; i < strings.length; i++) {
+      if (i > 0)
+        res.append('\n');
+      res.append(strings[i]);
+    }
+    return res.toString();
+  }
+
+  public String[] getUrls() {
+    String[] res = new String[urls.size()];
+    int i = 0;
+    Iterator<String> it = urls.iterator();
+    while (it.hasNext()) {
+      res[i] = (String) it.next();
+      i++;
+    }
+    return res;
+  }
+
+  public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3,
+      int arg4) throws IOException {
+    tagDefineFontInfo(arg0, arg1, arg2, arg3);
+  }
+
+  /**
+   * SWFTagTypes interface Save the Text Font character code info
+   */
+  public void tagDefineFontInfo(int fontId, String fontName, int flags,
+      int[] codes) throws IOException {
+    // System.out.println("-defineFontInfo id=" + fontId + ", name=" +
+    // fontName);
+    fontCodes.put(new Integer(fontId), codes);
+  }
+
+  // XXX too much hassle for too little return ... we cannot guess character
+  // XXX codes anyway, so we just give up.
+  /*
+   * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException {
+   * return null; }
+   */
+
+  /**
+   * SWFTagTypes interface. Save the character code info.
+   */
+  public SWFVectors tagDefineFont2(int id, int flags, String name,
+      int numGlyphs, int ascent, int descent, int leading, int[] codes,
+      int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2,
+      int[] kernAdjustments) throws IOException {
+    // System.out.println("-defineFontInfo id=" + id + ", name=" + name);
+    fontCodes.put(new Integer(id), (codes != null) ? codes : new int[0]);
+
+    return null;
+  }
+
+  /**
+   * SWFTagTypes interface. Dump any initial text in the field.
+   */
+  public void tagDefineTextField(int fieldId, String fieldName,
+      String initialText, Rect boundary, int flags, AlphaColor textColor,
+      int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
+      int rightMargin, int indentation, int lineSpacing) throws IOException {
+    if (initialText != null) {
+      strings.add(initialText);
+    }
+  }
+
+  /**
+   * SWFTagTypes interface
+   */
+  public SWFText tagDefineText(int id, Rect bounds, Matrix matrix)
+      throws IOException {
+    lastBounds = curBounds;
+    curBounds = bounds;
+    return new TextDumper();
+  }
+
+  Rect lastBounds = null;
+  Rect curBounds = null;
+
+  /**
+   * SWFTagTypes interface
+   */
+  public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix)
+      throws IOException {
+    lastBounds = curBounds;
+    curBounds = bounds;
+    return new TextDumper();
+  }
+
+  public class TextDumper implements SWFText {
+    protected Integer fontId;
+
+    protected boolean firstY = true;
+
+    public void font(int fontId, int textHeight) {
+      this.fontId = new Integer(fontId);
+    }
+
+    public void setY(int y) {
+      if (firstY)
+        firstY = false;
+      else
+        strings.add("\n"); // Change in Y - dump a new line
+    }
+
+    /*
+     * There are some issues with this method: sometimes SWF files define their
+     * own font, so short of OCR we cannot guess what is the glyph code ->
+     * character mapping. Additionally, some files don't use literal space
+     * character, instead they adjust glyphAdvances. We don't handle it at all -
+     * in such cases the text will be all glued together.
+     */
+    public void text(int[] glyphIndices, int[] glyphAdvances) {
+      // System.out.println("-text id=" + fontId);
+      int[] codes = (int[]) fontCodes.get(fontId);
+      if (codes == null) {
+        // unknown font, better not guess
+        strings.add("\n**** ?????????????? ****\n");
+        return;
+      }
+
+      // --Translate the glyph indices to character codes
+      char[] chars = new char[glyphIndices.length];
+
+      for (int i = 0; i < chars.length; i++) {
+        int index = glyphIndices[i];
+
+        if (index >= codes.length) // System Font ?
+        {
+          chars[i] = (char) index;
+        } else {
+          chars[i] = (char) (codes[index]);
+        }
+        // System.out.println("-ch[" + i + "]='" + chars[i] + "'(" +
+        // (int)chars[i] + ") +" + glyphAdvances[i]);
+      }
+      strings.add(new String(chars));
+    }
+
+    public void color(Color color) {
+    }
+
+    public void setX(int x) {
+    }
+
+    public void done() {
+      strings.add("\n");
+    }
+  }
+
+  public SWFActions tagDoAction() throws IOException {
+    // ActionTextWriter actions = new ActionTextWriter(new
+    // PrintWriter(System.out));
+    NutchSWFActions actions = new NutchSWFActions(actionStrings, urls);
+    return actions;
+  }
+
+  public SWFActions tagDoInitAction(int arg0) throws IOException {
+    // ActionTextWriter actions = new ActionTextWriter(new
+    // PrintWriter(System.out));
+    NutchSWFActions actions = new NutchSWFActions(actionStrings, urls);
+    return actions;
+  }
+
+  public void tagGeneratorFont(byte[] arg0) throws IOException {
+    // TODO Auto-generated method stub
+    super.tagGeneratorFont(arg0);
+  }
+
+  public void tagGeneratorText(byte[] arg0) throws IOException {
+    // TODO Auto-generated method stub
+    super.tagGeneratorText(arg0);
+  }
+
+}
+
+/**
+ * ActionScript parser. This parser tries to extract free text embedded inside
+ * the script, but without polluting it too much with names of variables,
+ * methods, etc. Not ideal, but it works.
+ */
+class NutchSWFActions extends SWFActionBlockImpl implements SWFActions {
+  private HashSet<String> strings = null;
+
+  private ArrayList<String> urls = null;
+
+  String[] dict = null;
+
+  Stack<Object> stack = null;
+
+  public NutchSWFActions(HashSet<String> strings, ArrayList<String> urls) {
+    this.strings = strings;
+    this.urls = urls;
+    stack = new SmallStack(100, strings);
+  }
+
+  public void lookupTable(String[] values) throws IOException {
+    for (int i = 0; i < values.length; i++) {
+      if (!strings.contains(values[i]))
+        strings.add(values[i]);
+    }
+    super.lookupTable(values);
+    dict = values;
+  }
+
+  public void defineLocal() throws IOException {
+    stack.pop();
+    super.defineLocal();
+  }
+
+  public void getURL(int vars, int mode) {
+    // System.out.println("-getURL: vars=" + vars + ", mode=" + mode);
+  }
+
+  public void getURL(String url, String target) throws IOException {
+    // System.out.println("-getURL: url=" + url + ", target=" + target);
+    stack.push(url);
+    stack.push(target);
+    strings.remove(url);
+    strings.remove(target);
+    urls.add(url);
+    super.getURL(url, target);
+  }
+
+  public SWFActionBlock.TryCatchFinally _try(String var) throws IOException {
+    // stack.push(var);
+    strings.remove(var);
+    return super._try(var);
+  }
+
+  public void comment(String var) throws IOException {
+    // stack.push(var);
+    strings.remove(var);
+    super.comment(var);
+  }
+
+  public void goToFrame(String var) throws IOException {
+    stack.push(var);
+    strings.remove(var);
+    super.gotoFrame(var);
+  }
+
+  public void ifJump(String var) throws IOException {
+    strings.remove(var);
+    super.ifJump(var);
+  }
+
+  public void jump(String var) throws IOException {
+    strings.remove(var);
+    super.jump(var);
+  }
+
+  public void jumpLabel(String var) throws IOException {
+    strings.remove(var);
+    super.jumpLabel(var);
+  }
+
+  public void lookup(int var) throws IOException {
+    if (dict != null && var >= 0 && var < dict.length) {
+      stack.push(dict[var]);
+    }
+    super.lookup(var);
+  }
+
+  public void push(String var) throws IOException {
+    stack.push(var);
+    strings.remove(var);
+    super.push(var);
+  }
+
+  public void setTarget(String var) throws IOException {
+    stack.push(var);
+    strings.remove(var);
+    super.setTarget(var);
+  }
+
+  public SWFActionBlock startFunction(String var, String[] params)
+      throws IOException {
+    stack.push(var);
+    strings.remove(var);
+    if (params != null) {
+      for (int i = 0; i < params.length; i++) {
+        strings.remove(params[i]);
+      }
+    }
+    return this;
+  }
+
+  public SWFActionBlock startFunction2(String var, int arg1, int arg2,
+      String[] params, int[] arg3) throws IOException {
+    stack.push(var);
+    strings.remove(var);
+    if (params != null) {
+      for (int i = 0; i < params.length; i++) {
+        strings.remove(params[i]);
+      }
+    }
+    return this;
+  }
+
+  public void waitForFrame(int num, String var) throws IOException {
+    stack.push(var);
+    strings.remove(var);
+    super.waitForFrame(num, var);
+  }
+
+  public void waitForFrame(String var) throws IOException {
+    stack.push(var);
+    strings.remove(var);
+    super.waitForFrame(var);
+  }
+
+  public void done() throws IOException {
+    while (stack.size() > 0) {
+      strings.remove(stack.pop());
+    }
+  }
+
+  public SWFActionBlock start(int arg0, int arg1) throws IOException {
+    return this;
+  }
+
+  public SWFActionBlock start(int arg0) throws IOException {
+    return this;
+  }
+
+  public void add() throws IOException {
+    super.add();
+  }
+
+  public void asciiToChar() throws IOException {
+    super.asciiToChar();
+  }
+
+  public void asciiToCharMB() throws IOException {
+    super.asciiToCharMB();
+  }
+
+  public void push(int var) throws IOException {
+    if (dict != null && var >= 0 && var < dict.length) {
+      stack.push(dict[var]);
+    }
+    super.push(var);
+  }
+
+  public void callFunction() throws IOException {
+    strings.remove(stack.pop());
+    super.callFunction();
+  }
+
+  public void callMethod() throws IOException {
+    strings.remove(stack.pop());
+    super.callMethod();
+  }
+
+  public void getMember() throws IOException {
+    // 0: name
+    String val = (String) stack.pop();
+    strings.remove(val);
+    super.getMember();
+  }
+
+  public void setMember() throws IOException {
+    // 0: value -1: name
+    stack.pop(); // value
+    String name = (String) stack.pop();
+    strings.remove(name);
+    super.setMember();
+  }
+
+  public void setProperty() throws IOException {
+    super.setProperty();
+  }
+
+  public void setVariable() throws IOException {
+    super.setVariable();
+  }
+
+  public void call() throws IOException {
+    strings.remove(stack.pop());
+    super.call();
+  }
+
+  public void setTarget() throws IOException {
+    strings.remove(stack.pop());
+    super.setTarget();
+  }
+
+  public void pop() throws IOException {
+    strings.remove(stack.pop());
+    super.pop();
+  }
+
+  public void push(boolean arg0) throws IOException {
+    stack.push("" + arg0);
+    super.push(arg0);
+  }
+
+  public void push(double arg0) throws IOException {
+    stack.push("" + arg0);
+    super.push(arg0);
+  }
+
+  public void push(float arg0) throws IOException {
+    stack.push("" + arg0);
+    super.push(arg0);
+  }
+
+  public void pushNull() throws IOException {
+    stack.push("");
+    super.pushNull();
+  }
+
+  public void pushRegister(int arg0) throws IOException {
+    stack.push("" + arg0);
+    super.pushRegister(arg0);
+  }
+
+  public void pushUndefined() throws IOException {
+    stack.push("???");
+    super.pushUndefined();
+  }
+
+  public void getProperty() throws IOException {
+    stack.pop();
+    super.getProperty();
+  }
+
+  public void getVariable() throws IOException {
+    strings.remove(stack.pop());
+    super.getVariable();
+  }
+
+  public void gotoFrame(boolean arg0) throws IOException {
+    stack.push("" + arg0);
+    super.gotoFrame(arg0);
+  }
+
+  public void gotoFrame(int arg0) throws IOException {
+    stack.push("" + arg0);
+    super.gotoFrame(arg0);
+  }
+
+  public void gotoFrame(String arg0) throws IOException {
+    stack.push("" + arg0);
+    strings.remove(arg0);
+    super.gotoFrame(arg0);
+  }
+
+  public void newObject() throws IOException {
+    stack.pop();
+    super.newObject();
+  }
+
+  public SWFActionBlock startWith() throws IOException {
+    return this;
+  }
+
+}
+
+/*
+ * Small bottom-less stack.
+ */
+class SmallStack extends Stack<Object> {
+
+  private static final long serialVersionUID = 1L;
+
+  private int maxSize;
+
+  private HashSet<String> strings = null;
+
+  public SmallStack(int maxSize, HashSet<String> strings) {
+    this.maxSize = maxSize;
+    this.strings = strings;
+  }
+
+  public Object push(Object o) {
+    // limit max size
+    if (this.size() > maxSize) {
+      String val = (String) remove(0);
+      strings.remove(val);
+    }
+    return super.push(o);
+  }
+
+  public Object pop() {
+    // tolerate underruns
+    if (this.size() == 0)
+      return null;
+    else
+      return super.pop();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
new file mode 100644
index 0000000..5942e64
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse Flash SWF files.
+ */
+package org.apache.nutch.parse.swf;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
new file mode 100644
index 0000000..129b85f
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.swf;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for SWFParser.
+ */
+public class TestSWFParser { // compares text extracted from sample SWF files with expected .txt fixtures
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  private String[] sampleFiles = new String[] { "test1.swf", "test2.swf",
+      "test3.swf" };
+  private String[] sampleTexts = new String[] { "test1.txt", "test2.txt", // fixture names; each entry is replaced by the file's normalized contents in the constructor
+      "test3.txt" };
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException { // fetch each sample SWF via the file: protocol, parse it, and compare normalized text
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+
+      parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); // normalize whitespace the same way the constructor does for the fixtures
+      Assert.assertTrue(sampleTexts[i].equals(text)); // NOTE(review): assertEquals would report both values on failure
+    }
+  }
+
+  public TestSWFParser() { // pre-loads the expected texts; runs before the @Test method
+    for (int i = 0; i < sampleFiles.length; i++) {
+      try {
+        // read the test string
+        FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+            + sampleTexts[i]);
+        StringBuffer sb = new StringBuffer();
+        int len = 0;
+        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+        char[] buf = new char[1024];
+        while ((len = isr.read(buf)) > 0) {
+          sb.append(buf, 0, len);
+        }
+        isr.close(); // NOTE(review): not closed on exception; try-with-resources would be safer
+        sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim();
+      } catch (Exception e) {
+        e.printStackTrace(); // NOTE(review): failure is only printed; testIt would then compare against the stale file name
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test1.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.swf b/nutch-plugins/parse-swf/src/test/resources/test1.swf
new file mode 100644
index 0000000..cd2019b
Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test1.swf differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test1.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.txt b/nutch-plugins/parse-swf/src/test/resources/test1.txt
new file mode 100644
index 0000000..68505d5
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test1.txt
@@ -0,0 +1,60 @@
+
+--------
+/go/gnav_cart
+/go/gnav_company
+/go/gnav_devnet
+/go/gnav_downloads
+/go/gnav_fl_minmessage
+/go/gnav_help
+/go/gnav_mm_home
+/go/gnav_products
+/go/gnav_search?loc=en_us
+/go/gnav_showcase
+/go/gnav_solutions
+/go/gnav_store
+/go/gnav_support
+/go/gnav_your_account
+Acquisition Info
+Adobe Home
+AppleGothic
+Array
+Company
+Developers
+Downloads
+Help
+Home
+International
+LocaleManager
+Macromedia Flash Player
+Macromedia Home
+MovieClip
+Products
+Showcase
+Solutions
+Store
+String
+Support
+TextFormat
+To ensure the best possible Internet Experience, please download the latest version of the free
+Verdana
+_sans
+active
+bluePill
+button
+color
+company
+devnet
+downloads
+en_us
+home
+javascript:openCrosslinkWindow('/go/adobeacquisition')
+javascript:openCrosslinkWindow('/go/gnav_adobe_home')
+products
+rollOut
+rollOver
+selected
+showcase
+solutions
+support
+tabHolder
+textColor

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test2.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.swf b/nutch-plugins/parse-swf/src/test/resources/test2.swf
new file mode 100644
index 0000000..eb9b03d
Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test2.swf differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test2.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.txt b/nutch-plugins/parse-swf/src/test/resources/test2.txt
new file mode 100644
index 0000000..f77b78a
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test2.txt
@@ -0,0 +1,5 @@
+Impact Impact Impact  Arial Arial Arial  Webdings Webdings Webdings  Verdana Verdana Verdana  CourierNew CourierNew CourierNew  Bimini Bimini Bimini 
+--------
+TextFormat
+color
+font

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test3.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.swf b/nutch-plugins/parse-swf/src/test/resources/test3.swf
new file mode 100644
index 0000000..4df9f1e
Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test3.swf differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test3.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.txt b/nutch-plugins/parse-swf/src/test/resources/test3.txt
new file mode 100644
index 0000000..66ae3d8
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test3.txt
@@ -0,0 +1,11 @@
+Mix. 
+ Edit. 
+ Master. 
+ Compose. 
+ Animate. 
+ With a single suite of powerful tools 
+ that work together as one. 
+ World-class video and audio tools that bring  
+ new power and efficiency to your film, video,  
+ DVD, and web workflows. 
+ Learn more. 

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/build-ivy.xml b/nutch-plugins/parse-tika/build-ivy.xml
new file mode 100644
index 0000000..e4984d8
--- /dev/null
+++ b/nutch-plugins/parse-tika/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/build.xml b/nutch-plugins/parse-tika/build.xml
new file mode 100644
index 0000000..4ecb3f8
--- /dev/null
+++ b/nutch-plugins/parse-tika/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-tika" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+  
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-nekohtml" />
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-nekohtml/*.jar" />
+    </fileset>
+  </path>
+  
+    <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+    <ant target="deploy" inheritall="false" dir="../lib-nekohtml" />
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.rss"/>
+      <include name="*.rtf"/>
+      <include name="*.pdf"/>
+      <include name="ootest.*"/>
+      <include name="*.doc"/>
+      <include name="*.gif"/>
+    </fileset>
+  </copy>
+  
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/howto_upgrade_tika.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/howto_upgrade_tika.txt b/nutch-plugins/parse-tika/howto_upgrade_tika.txt
new file mode 100644
index 0000000..63a05a4
--- /dev/null
+++ b/nutch-plugins/parse-tika/howto_upgrade_tika.txt
@@ -0,0 +1,8 @@
+1. Upgrade Tika dependency in trunk/ivy/ivy.xml
+
+2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
+
+3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
+   To get the list of dependencies and their versions execute:
+   $ ant -f ./build-ivy.xml
+   $ ls lib | sed 's/^/      <library name="/g' | sed 's/$/"\/>/g'

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/ivy.xml b/nutch-plugins/parse-tika/ivy.xml
new file mode 100644
index 0000000..7a9e959
--- /dev/null
+++ b/nutch-plugins/parse-tika/ivy.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.12" conf="*->default">
+     <exclude org="org.apache.tika" name="tika-core" />
+     <exclude org="org.apache.httpcomponents" name="httpclient" />
+     <exclude org="org.apache.httpcomponents" name="httpcore" />
+    </dependency>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/plugin.xml b/nutch-plugins/parse-tika/plugin.xml
new file mode 100644
index 0000000..04fcd2e
--- /dev/null
+++ b/nutch-plugins/parse-tika/plugin.xml
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-tika"
+   name="Tika Parser Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-tika.jar">
+         <export name="*"/>
+      </library>
+      <library name="apache-mime4j-core-0.7.2.jar"/>
+      <library name="apache-mime4j-dom-0.7.2.jar"/>
+      <library name="asm-5.0.4.jar"/>
+      <library name="aspectjrt-1.8.0.jar"/>
+      <library name="bcmail-jdk15on-1.52.jar"/>
+      <library name="bcpkix-jdk15on-1.52.jar"/>
+      <library name="bcprov-jdk15on-1.52.jar"/>
+      <library name="boilerpipe-1.1.0.jar"/>
+      <library name="bzip2-0.9.1.jar"/>
+      <library name="c3p0-0.9.1.1.jar"/>
+      <library name="cdm-4.5.5.jar"/>
+      <library name="commons-codec-1.6.jar"/>
+      <library name="commons-compress-1.10.jar"/>
+      <library name="commons-csv-1.0.jar"/>
+      <library name="commons-exec-1.3.jar"/>
+      <library name="commons-io-2.4.jar"/>
+      <library name="commons-lang-2.6.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="commons-logging-api-1.1.jar"/>
+      <library name="commons-vfs2-2.0.jar"/>
+      <library name="cxf-core-3.0.3.jar"/>
+      <library name="cxf-rt-frontend-jaxrs-3.0.3.jar"/>
+      <library name="cxf-rt-rs-client-3.0.3.jar"/>
+      <library name="cxf-rt-transports-http-3.0.3.jar"/>
+      <library name="ehcache-core-2.6.2.jar"/>
+      <library name="fontbox-1.8.10.jar"/>
+      <library name="geoapi-3.0.0.jar"/>
+      <library name="grib-4.5.5.jar"/>
+      <library name="gson-2.2.4.jar"/>
+      <library name="guava-17.0.jar"/>
+      <library name="httpmime-4.2.6.jar"/>
+      <library name="httpservices-4.5.5.jar"/>
+      <library name="isoparser-1.0.2.jar"/>
+      <library name="jackcess-2.1.2.jar"/>
+      <library name="jackcess-encrypt-2.1.1.jar"/>
+      <library name="java-libpst-0.8.1.jar"/>
+      <library name="javax.annotation-api-1.2.jar"/>
+      <library name="javax.ws.rs-api-2.0.1.jar"/>
+      <library name="jcip-annotations-1.0.jar"/>
+      <library name="jcommander-1.35.jar"/>
+      <library name="jdom-2.0.2.jar"/>
+      <library name="jdom2-2.0.4.jar"/>
+      <library name="jempbox-1.8.10.jar"/>
+      <library name="jhighlight-1.0.2.jar"/>
+      <library name="jj2000-5.2.jar"/>
+      <library name="jmatio-1.0.jar"/>
+      <library name="jna-4.1.0.jar"/>
+      <library name="joda-time-2.2.jar"/>
+      <library name="json-20140107.jar"/>
+      <library name="json-simple-1.1.1.jar"/>
+      <library name="jsoup-1.7.2.jar"/>
+      <library name="jsr-275-0.9.3.jar"/>
+      <library name="juniversalchardet-1.0.3.jar"/>
+      <library name="junrar-0.7.jar"/>
+      <library name="jwnl-1.3.3.jar"/>
+      <library name="maven-scm-api-1.4.jar"/>
+      <library name="maven-scm-provider-svn-commons-1.4.jar"/>
+      <library name="maven-scm-provider-svnexe-1.4.jar"/>
+      <library name="metadata-extractor-2.8.0.jar"/>
+      <library name="netcdf4-4.5.5.jar"/>
+      <library name="opennlp-maxent-3.0.3.jar"/>
+      <library name="opennlp-tools-1.5.3.jar"/>
+      <library name="pdfbox-1.8.10.jar"/>
+      <library name="plexus-utils-1.5.6.jar"/>
+      <library name="poi-3.13.jar"/>
+      <library name="poi-ooxml-3.13.jar"/>
+      <library name="poi-ooxml-schemas-3.13.jar"/>
+      <library name="poi-scratchpad-3.13.jar"/>
+      <library name="protobuf-java-2.5.0.jar"/>
+      <library name="quartz-2.2.0.jar"/>
+      <library name="regexp-1.3.jar"/>
+      <library name="rome-1.5.1.jar"/>
+      <library name="rome-utils-1.5.1.jar"/>
+      <library name="sis-metadata-0.5.jar"/>
+      <library name="sis-netcdf-0.5.jar"/>
+      <library name="sis-referencing-0.5.jar"/>
+      <library name="sis-storage-0.5.jar"/>
+      <library name="sis-utility-0.5.jar"/>
+      <library name="slf4j-api-1.7.12.jar"/>
+      <library name="stax2-api-3.1.4.jar"/>
+      <library name="tagsoup-1.2.1.jar"/>
+      <library name="tika-parsers-1.12.jar"/>
+      <library name="udunits-4.5.5.jar"/>
+      <library name="vorbis-java-core-0.6.jar"/>
+      <library name="vorbis-java-tika-0.6.jar"/>
+      <library name="woodstox-core-asl-4.4.1.jar"/>
+      <library name="xmlbeans-2.6.0.jar"/>
+      <library name="xmlschema-core-2.1.0.jar"/>
+      <library name="xmpcore-5.1.2.jar"/>
+      <library name="xz-1.5.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-nekohtml"/>
+   </requires>
+
+   <extension point="org.apache.nutch.parse.Parser"
+              id="org.apache.nutch.parse.tika"
+              name="TikaParser">
+
+      <implementation id="org.apache.nutch.parse.tika.TikaParser"
+                      class="org.apache.nutch.parse.tika.TikaParser">
+       <parameter name="contentType" value="*"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/pom.xml b/nutch-plugins/parse-tika/pom.xml
new file mode 100644
index 0000000..0cf2340
--- /dev/null
+++ b/nutch-plugins/parse-tika/pom.xml
@@ -0,0 +1,54 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-tika</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-tika</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers</artifactId>
+            <version>1.13</version>
+            <exclusions>
+                <!-- TODO -->
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-nekohtml</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
new file mode 100644
index 0000000..7c0d71b
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.lang.ClassLoader;
+import java.lang.InstantiationException;
+import java.util.HashMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.extractors.*;
+
+class BoilerpipeExtractorRepository {
+
+    public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class);
+    public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<String, BoilerpipeExtractor>();
+ 
+    /**
+     * Returns an instance of the specified extractor
+     */
+    public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
+      // Check if there's no instance of this extractor
+      if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+        // FQCN
+        boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
+
+        // Attempt to load the class
+        try {
+          ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+          Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+
+          // Add an instance to the repository
+          extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance());
+
+        } catch (ClassNotFoundException e) {
+          LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
+        } catch (InstantiationException e) {
+          LOG.error("Could not instantiate " + boilerpipeExtractorName);
+        } catch (Exception e) {
+          LOG.error(e);
+        }
+      }
+
+      return extractorRepository.get(boilerpipeExtractorName);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
new file mode 100644
index 0000000..77a1044
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
@@ -0,0 +1,794 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.apache.nutch.parse.tika;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * This class takes SAX events (in addition to some extra events that SAX
+ * doesn't handle yet) and adds the result to a document or document fragment.
+ */
+class DOMBuilder implements ContentHandler, LexicalHandler {
+  private boolean upperCaseElementNames = true;
+
+  /** Root document */
+  public Document m_doc;
+
+  /** Current node */
+  protected Node m_currentNode = null;
+
+  /** First node of document fragment or null if not a DocumentFragment */
+  public DocumentFragment m_docFrag = null;
+
+  /** Vector of element nodes */
+  protected Stack<Element> m_elemStack = new Stack<Element>();
+
+  /**
+  * Element recorded with this namespace will be converted to Node without a
+  * namespace
+  */
+  private String defaultNamespaceURI = null;
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param node
+   *          Current node
+   */
+  DOMBuilder(Document doc, Node node) {
+    m_doc = doc;
+    m_currentNode = node;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param docFrag
+   *          Document fragment
+   */
+  DOMBuilder(Document doc, DocumentFragment docFrag) {
+    m_doc = doc;
+    m_docFrag = docFrag;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document.
+   * 
+   * @param doc
+   *          Root document
+   */
+  DOMBuilder(Document doc) {
+    m_doc = doc;
+  }
+
+  /**
+   * Get the root node of the DOM being created. This is either a Document or a
+   * DocumentFragment.
+   * 
+   * @return The root document or document fragment if not null
+   */
+  Node getRootNode() {
+    return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
+  }
+
+  /**
+   * Get the node currently being processed.
+   * 
+   * @return the current node being processed
+   */
+  Node getCurrentNode() {
+    return m_currentNode;
+  }
+
+  /**
+   * Return null since there is no Writer for this class.
+   * 
+   * @return null
+   */
+  java.io.Writer getWriter() {
+    return null;
+  }
+
+  /**
+   * Append a node to the current container.
+   * 
+   * @param newNode
+   *          New node to append
+   */
+  protected void append(Node newNode) throws org.xml.sax.SAXException {
+
+    Node currentNode = m_currentNode;
+
+    if (null != currentNode) {
+      currentNode.appendChild(newNode);
+
+      // System.out.println(newNode.getNodeName());
+    } else if (null != m_docFrag) {
+      m_docFrag.appendChild(newNode);
+    } else {
+      boolean ok = true;
+      short type = newNode.getNodeType();
+
+      if (type == Node.TEXT_NODE) {
+        String data = newNode.getNodeValue();
+
+        if ((null != data) && (data.trim().length() > 0)) {
+          throw new org.xml.sax.SAXException(
+              "Warning: can't output text before document element!  Ignoring...");
+        }
+
+        ok = false;
+      } else if (type == Node.ELEMENT_NODE) {
+        if (m_doc.getDocumentElement() != null) {
+          throw new org.xml.sax.SAXException(
+              "Can't have more than one root on a DOM!");
+        }
+      }
+
+      if (ok)
+        m_doc.appendChild(newNode);
+    }
+  }
+
+  /**
+   * Receive an object for locating the origin of SAX document events.
+   * 
+   * <p>
+   * SAX parsers are strongly encouraged (though not absolutely required) to
+   * supply a locator: if it does so, it must supply the locator to the
+   * application by invoking this method before invoking any of the other
+   * methods in the ContentHandler interface.
+   * </p>
+   * 
+   * <p>
+   * The locator allows the application to determine the end position of any
+   * document-related event, even if the parser is not reporting an error.
+   * Typically, the application will use this information for reporting its own
+   * errors (such as character content that does not match an application's
+   * business rules). The information returned by the locator is probably not
+   * sufficient for use with a search engine.
+   * </p>
+   * 
+   * <p>
+   * Note that the locator will return correct information only during the
+   * invocation of the events in this interface. The application should not
+   * attempt to use it at any other time.
+   * </p>
+   * 
+   * @param locator
+   *          An object that can return the location of any SAX document event.
+   * @see org.xml.sax.Locator
+   */
+  public void setDocumentLocator(Locator locator) {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of a document.
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, before any other methods
+   * in this interface or in DTDHandler (except for setDocumentLocator).
+   * </p>
+   */
+  public void startDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the end of a document.
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, and it will be the last
+   * method invoked during the parse. The parser shall not invoke this method
+   * until it has either abandoned parsing (because of an unrecoverable error)
+   * or reached the end of input.
+   * </p>
+   */
+  public void endDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of an element.
+   * 
+   * <p>
+   * The Parser will invoke this method at the beginning of every element in the
+   * XML document; there will be a corresponding endElement() event for every
+   * startElement() event (even when the element is empty). All of the element's
+   * content will be reported, in order, before the corresponding endElement()
+   * event.
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached. Note that the attribute list provided will contain only
+   * attributes with explicit values (specified or defaulted): #IMPLIED
+   * attributes will be omitted.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          The namespace of the node
+   * @param localName
+   *          The local part of the qualified name
+   * @param name
+   *          The element name.
+   * @param atts
+   *          The attributes attached to the element, if any.
+   * @see #endElement
+   * @see org.xml.sax.Attributes
+   */
+  public void startElement(String ns, String localName, String name,
+      Attributes atts) throws org.xml.sax.SAXException {
+
+    Element elem;
+
+    if (upperCaseElementNames)
+      name = name.toUpperCase();
+
+    // Note that the namespace-aware call must be used to correctly
+    // construct a Level 2 DOM, even for non-namespaced nodes.
+    if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI))
+      elem = m_doc.createElementNS(null, name);
+    else
+      elem = m_doc.createElementNS(ns, name);
+
+    append(elem);
+
+    try {
+      int nAtts = atts.getLength();
+
+      if (0 != nAtts) {
+        for (int i = 0; i < nAtts; i++) {
+
+          // System.out.println("type " + atts.getType(i) + " name " +
+          // atts.getLocalName(i) );
+          // First handle a possible ID attribute
+          if (atts.getType(i).equalsIgnoreCase("ID"))
+            setIDAttribute(atts.getValue(i), elem);
+
+          String attrNS = atts.getURI(i);
+
+          if ("".equals(attrNS))
+            attrNS = null; // DOM represents no-namespace as null
+
+          // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i)
+          // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
+          // Crimson won't let us set an xmlns: attribute on the DOM.
+          String attrQName = atts.getQName(i);
+
+          // In SAX, xmlns: attributes have an empty namespace, while in DOM
+          // they should have the xmlns namespace
+          if (attrQName.startsWith("xmlns:"))
+            attrNS = "http://www.w3.org/2000/xmlns/";
+
+          // ALWAYS use the DOM Level 2 call!
+          elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
+        }
+      }
+
+      // append(elem);
+
+      m_elemStack.push(elem);
+
+      m_currentNode = elem;
+
+      // append(elem);
+    } catch (java.lang.Exception de) {
+      // de.printStackTrace();
+      throw new org.xml.sax.SAXException(de);
+    }
+
+  }
+
+  /**
+   * 
+   * 
+   * 
+   * Receive notification of the end of an element.
+   * 
+   * <p>
+   * The SAX parser will invoke this method at the end of every element in the
+   * XML document; there will be a corresponding startElement() event for every
+   * endElement() event (even when the element is empty).
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached to the name.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          the namespace of the element
+   * @param localName
+   *          The local part of the qualified name of the element
+   * @param name
+   *          The element name
+   */
+  public void endElement(String ns, String localName, String name)
+      throws org.xml.sax.SAXException {
+    if (!m_elemStack.isEmpty()) {
+      m_elemStack.pop();
+    }
+    m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
+  }
+
+  /**
+   * Set an ID string to node association in the ID table.
+   * 
+   * @param id
+   *          The ID string.
+   * @param elem
+   *          The associated ID.
+   */
+  public void setIDAttribute(String id, Element elem) {
+
+    // Do nothing. This method is meant to be overiden.
+  }
+
+  /**
+   * Receive notification of character data.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void characters(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    if (m_inCData) {
+      cdata(ch, start, length);
+
+      return;
+    }
+
+    String s = new String(ch, start, length);
+    Node childNode;
+    childNode = m_currentNode != null ? m_currentNode.getLastChild() : null;
+    if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) {
+      ((Text) childNode).appendData(s);
+    } else {
+      Text text = m_doc.createTextNode(s);
+      append(text);
+    }
+  }
+
+  /**
+   * If available, when the disable-output-escaping attribute is used, output
+   * raw text without escaping. A PI will be inserted in front of the node with
+   * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom".
+   * 
+   * @param ch
+   *          Array containing the characters
+   * @param start
+   *          Index to start of characters in the array
+   * @param length
+   *          Number of characters in the array
+   */
+  public void charactersRaw(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createProcessingInstruction("xslt-next-is-raw",
+        "formatter-to-dom"));
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Report the beginning of an entity.
+   * 
+   * The start and end of the document entity are not reported. The start and
+   * end of the external DTD subset are reported using the pseudo-name "[dtd]".
+   * All other events must be properly nested within start/end entity events.
+   * 
+   * @param name
+   *          The name of the entity. If it is a parameter entity, the name will
+   *          begin with '%'.
+   * @see #endEntity
+   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+   */
+  public void startEntity(String name) throws org.xml.sax.SAXException {
+
+    // Almost certainly the wrong behavior...
+    // entityReference(name);
+  }
+
+  /**
+   * Report the end of an entity.
+   * 
+   * @param name
+   *          The name of the entity that is ending.
+   * @see #startEntity
+   */
+  public void endEntity(String name) throws org.xml.sax.SAXException {
+  }
+
+  /**
+   * Receive notivication of a entityReference.
+   * 
+   * @param name
+   *          name of the entity reference
+   */
+  public void entityReference(String name) throws org.xml.sax.SAXException {
+    append(m_doc.createEntityReference(name));
+  }
+
+  /**
+   * Receive notification of ignorable whitespace in element content.
+   * 
+   * <p>
+   * Validating Parsers must use this method to report each chunk of ignorable
+   * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
+   * non-validating parsers may also use this method if they are capable of
+   * parsing and using content models.
+   * </p>
+   * 
+   * <p>
+   * SAX parsers may return all contiguous whitespace in a single chunk, or they
+   * may split it into several chunks; however, all of the characters in any
+   * single event must come from the same external entity, so that the Locator
+   * provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #characters
+   */
+  public void ignorableWhitespace(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem())
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Tell if the current node is outside the document element.
+   * 
+   * @return true if the current node is outside the document element.
+   */
+  private boolean isOutsideDocElem() {
+    return (null == m_docFrag)
+        && m_elemStack.size() == 0
+        && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
+  }
+
+  /**
+   * Receive notification of a processing instruction.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each processing instruction
+   * found: note that processing instructions may occur before or after the main
+   * document element.
+   * </p>
+   * 
+   * <p>
+   * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
+   * or a text declaration (XML 1.0, section 4.3.1) using this method.
+   * </p>
+   * 
+   * @param target
+   *          The processing instruction target.
+   * @param data
+   *          The processing instruction data, or null if none was supplied.
+   */
+  public void processingInstruction(String target, String data)
+      throws org.xml.sax.SAXException {
+    append(m_doc.createProcessingInstruction(target, data));
+  }
+
+  /**
+   * Report an XML comment anywhere in the document.
+   * 
+   * This callback will be used for comments inside or outside the document
+   * element, including comments in the external DTD subset (if read).
+   * 
+   * @param ch
+   *          An array holding the characters in the comment.
+   * @param start
+   *          The starting position in the array.
+   * @param length
+   *          The number of characters to use from the array.
+   */
+  public void comment(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    // tagsoup sometimes submits invalid values here
+    if (ch == null || start < 0 || length >= (ch.length - start) || length < 0)
+      return;
+    append(m_doc.createComment(new String(ch, start, length)));
+  }
+
+  /** Flag indicating that we are processing a CData section */
+  protected boolean m_inCData = false;
+
+  /**
+   * Report the start of a CDATA section.
+   * 
+   * @see #endCDATA
+   */
+  public void startCDATA() throws org.xml.sax.SAXException {
+    m_inCData = true;
+    append(m_doc.createCDATASection(""));
+  }
+
+  /**
+   * Report the end of a CDATA section.
+   * 
+   * @see #startCDATA
+   */
+  public void endCDATA() throws org.xml.sax.SAXException {
+    m_inCData = false;
+  }
+
+  /**
+   * Receive notification of cdata.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void cdata(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    // XXX ab@apache.org: modified from the original, to accomodate TagSoup.
+    Node n = m_currentNode.getLastChild();
+    if (n instanceof CDATASection)
+      ((CDATASection) n).appendData(s);
+    else if (n instanceof Comment)
+      ((Comment) n).appendData(s);
+  }
+
+  /**
+   * Report the start of DTD declarations, if any.
+   * 
+   * Any declarations are assumed to be in the internal subset unless otherwise
+   * indicated.
+   * 
+   * @param name
+   *          The document type name.
+   * @param publicId
+   *          The declared public identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @param systemId
+   *          The declared system identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @see #endDTD
+   * @see #startEntity
+   */
+  public void startDTD(String name, String publicId, String systemId)
+      throws org.xml.sax.SAXException {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Report the end of DTD declarations.
+   * 
+   * @see #startDTD
+   */
+  public void endDTD() throws org.xml.sax.SAXException {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Begin the scope of a prefix-URI Namespace mapping.
+   * 
+   * <p>
+   * The information from this event is not necessary for normal Namespace
+   * processing: the SAX XML reader will automatically replace prefixes for
+   * element and attribute names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).
+   * </p>
+   * 
+   * <p>
+   * There are cases, however, when applications need to use prefixes in
+   * character data or in attribute values, where they cannot safely be expanded
+   * automatically; the start/endPrefixMapping event supplies the information to
+   * the application to expand prefixes in those contexts itself, if necessary.
+   * </p>
+   * 
+   * <p>
+   * Note that start/endPrefixMapping events are not guaranteed to be properly
+   * nested relative to each-other: all startPrefixMapping events will occur
+   * before the corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event, but their order
+   * is not guaranteed.
+   * </p>
+   * 
+   * @param prefix
+   *          The Namespace prefix being declared.
+   * @param uri
+   *          The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+      throws org.xml.sax.SAXException {
+
+    /*
+     * // Not sure if this is needed or wanted // Also, it fails in the stree.
+     * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
+     * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
+     * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
+     * = "xmlns:"+prefix;
+     * 
+     * Element elem = (Element)m_currentNode; String val =
+     * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
+     * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
+     * uri); } }
+     */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   * 
+   * <p>
+   * See startPrefixMapping for details. This event will always occur after the
+   * corresponding endElement event, but the order of endPrefixMapping events is
+   * not otherwise guaranteed.
+   * </p>
+   * 
+   * @param prefix
+   *          The prefix that was being mapping.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
+  }
+
+  /**
+   * Receive notification of a skipped entity.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each entity skipped.
+   * Non-validating processors may skip entities if they have not seen the
+   * declarations (because, for example, the entity was declared in an external
+   * DTD subset). All processors may skip external entities, depending on the
+   * values of the http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities properties.
+   * </p>
+   * 
+   * @param name
+   *          The name of the skipped entity. If it is a parameter entity, the
+   *          name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException {
+  }
+
+  public boolean isUpperCaseElementNames() {
+    return upperCaseElementNames;
+  }
+
+  public void setUpperCaseElementNames(boolean upperCaseElementNames) {
+    this.upperCaseElementNames = upperCaseElementNames;
+  }
+ 
+  public String getDefaultNamespaceURI() {
+    return defaultNamespaceURI;
+  }
+
+  public void setDefaultNamespaceURI(String defaultNamespaceURI) {
+    this.defaultNamespaceURI = defaultNamespaceURI;
+  }
+}


[44/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
new file mode 100644
index 0000000..2e1b9c2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -0,0 +1,371 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.InetSocketAddress;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reads and parses a URL and run the indexers on it. Displays the fields
+ * obtained and the first 100 characters of their value
+ * 
+ * Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker
+ * http://www.lemonde.fr
+ * 
+ * @author Julien Nioche
+ **/
+
+public class IndexingFiltersChecker extends Configured implements Tool {
+
  // URL normalizers; non-null only when -normalize is given on the command line
  protected URLNormalizers normalizers = null;
  // whether to print the parse text (-dumpText)
  protected boolean dumpText = false;
  // whether protocol-level redirects are followed (-followRedirects)
  protected boolean followRedirects = false;
  // in listen mode, keep the client connection open between requests (-keepClientCnxOpen)
  protected boolean keepClientCnxOpen = false;
  // used to simulate the metadata propagated from injection (-md key=value)
  protected HashMap<String, String> metadata = new HashMap<String, String>();
  // TCP port to listen on (-listen <port>), or -1 to check a single URL and exit
  protected int tcpPort = -1;

  public static final Logger LOG = LoggerFactory
      .getLogger(IndexingFiltersChecker.class);

  public IndexingFiltersChecker() {

  }
+
+  public int run(String[] args) throws Exception {
+    String url = null;
+    String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen <port>] [-keepClientCnxOpen]";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      return -1;
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-normalize")) {
+        normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+      } else if (args[i].equals("-listen")) {
+        tcpPort = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-followRedirects")) {
+        followRedirects = true;
+      } else if (args[i].equals("-keepClientCnxOpen")) {
+        keepClientCnxOpen = true;
+      } else if (args[i].equals("-dumpText")) {
+        dumpText = true;
+      } else if (args[i].equals("-md")) {
+        String k = null, v = null;
+        String nextOne = args[++i];
+        int firstEquals = nextOne.indexOf("=");
+        if (firstEquals != -1) {
+          k = nextOne.substring(0, firstEquals);
+          v = nextOne.substring(firstEquals + 1);
+        } else
+          k = nextOne;
+        metadata.put(k, v);
+      } else if (i != args.length - 1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else {
+        url =args[i];
+      }
+    }
+    
+    // In listening mode?
+    if (tcpPort == -1) {
+      // No, just fetch and display
+      StringBuilder output = new StringBuilder();
+      int ret = fetch(url, output);
+      System.out.println(output);
+      return ret;
+    } else {
+      // Listen on socket and start workers on incoming requests
+      listen();
+    }
+    
+    return 0;
+  }
+  
  /**
   * Binds a server socket to {@code tcpPort} and serves incoming
   * connections forever, spawning one {@link Worker} thread per client.
   * This method never returns normally.
   */
  protected void listen() throws Exception {
    ServerSocket server = null;

    try{
      server = new ServerSocket();
      server.bind(new InetSocketAddress(tcpPort));
      LOG.info(server.toString());
    } catch (Exception e) {
      // NOTE(review): a bind failure terminates the whole JVM; the server
      // socket is never closed because the accept loop below runs forever.
      // Acceptable for a command-line checker tool — confirm if reused.
      LOG.error("Could not listen on port " + tcpPort);
      System.exit(-1);
    }
    
    while(true){
      Worker worker;
      try{
        worker = new Worker(server.accept());
        Thread thread = new Thread(worker);
        thread.start();
      } catch (Exception e) {
        // NOTE(review): a single failed accept() also exits the JVM.
        LOG.error("Accept failed: " + tcpPort);
        System.exit(-1);
      }
    }
  }
+  
+  private class Worker implements Runnable {
+    private Socket client;
+
+    Worker(Socket client) {
+      this.client = client;
+      LOG.info(client.toString());
+    }
+
+    public void run() {
+      if (keepClientCnxOpen) {
+        while (true) { // keep connection open until closes
+          readWrite();
+        }
+      } else {
+        readWrite();
+        
+        try { // close ourselves
+          client.close();
+        } catch (Exception e){
+          LOG.error(e.toString());
+        }
+      }
+    }
+    
+    protected void readWrite() {
+      String line;
+      BufferedReader in = null;
+      PrintWriter out = null;
+      
+      try{
+        in = new BufferedReader(new InputStreamReader(client.getInputStream()));
+      } catch (Exception e) {
+        LOG.error("in or out failed");
+        System.exit(-1);
+      }
+
+      try{
+        line = in.readLine();        
+        StringBuilder output = new StringBuilder();
+        fetch(line, output);
+        
+        client.getOutputStream().write(output.toString().getBytes(Charset.forName("UTF-8")));
+      }catch (Exception e) {
+        LOG.error("Read/Write failed: " + e);
+      }
+    }
+  }
+    
+  
  /**
   * Fetches a single URL, parses the content, applies the scoring and
   * indexing filters and appends a human-readable dump of the resulting
   * {@link NutchDocument} fields to {@code output}.
   *
   * @param url the URL to check; normalized first when normalizers are
   *        configured
   * @param output buffer that receives the field dump or an error report
   * @return 0 on success or on a reported fetch/filter problem, -1 when no
   *         content type could be determined or no parse was produced
   * @throws Exception if protocol resolution, fetching or parsing fails hard
   */
  protected int fetch(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
      url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }

    LOG.info("fetching: " + url);

    CrawlDatum datum = new CrawlDatum();

    // Copy the user-supplied metadata key/value pairs into the datum so the
    // indexing filters can see them.
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
      String key = iter.next();
      String value = metadata.get(key);
      if (value == null)
        value = "";
      datum.getMetaData().put(new Text(key), new Text(value));
    }

    IndexingFilters indexers = new IndexingFilters(getConf());
    
    // Cap on how many redirects we are willing to follow.
    int maxRedirects = 3;

    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
    Text turl = new Text(url);
    
    // Following redirects and not reached maxRedirects?
    while (!protocolOutput.getStatus().isSuccess() && followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects != 0) {
      // The redirect target is the first status argument.
      String[] stuff = protocolOutput.getStatus().getArgs();
      url = stuff[0];
      
      if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
      }
    
      turl.set(url);
      
      // try again
      protocolOutput = getProtocolOutput(url, datum);
      maxRedirects--;
    }

    if (!protocolOutput.getStatus().isSuccess()) {
      output.append("Fetch failed with protocol status: "
          + protocolOutput.getStatus() + "\n");
      return 0;
    }

    Content content = protocolOutput.getContent();

    if (content == null) {
      output.append("No content for " + url + "\n");
      return 0;
    }

    String contentType = content.getContentType();

    if (contentType == null) {
      return -1;
    }

    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
        new Text(contentType));

    if (ParseSegment.isTruncated(content)) {
      LOG.warn("Content is truncated, parse may fail!");
    }

    ScoringFilters scfilters = new ScoringFilters(getConf());
    // call the scoring filters
    try {
      scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
      LOG.warn("Couldn't pass score, url {} ({})", url, e);
    }

    LOG.info("parsing: {}", url);
    LOG.info("contentType: {}", contentType);

    ParseResult parseResult = new ParseUtil(getConf()).parse(content);

    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);

    // NOTE(review): inlinks is always null here, so indexing filters receive
    // no inlink information from this checker.
    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    if (parse == null) {
      // The parse result is keyed by URL; list the keys we do have to help
      // diagnose normalization mismatches.
      LOG.error("Failed to get parse from parse result");
      LOG.error("Available parses in parse result (by URL key):");
      for (Map.Entry<Text, Parse> entry : parseResult) {
        LOG.error("  " + entry.getKey());
      }
      LOG.error("Parse result does not contain a parse for URL to be checked:");
      LOG.error("  " + urlText);
      return -1;
    }

    // Compute the page signature and expose it both as an indexed "digest"
    // field and on the datum.
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content,
        parse);
    parse.getData().getContentMeta()
        .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);
    datum.setSignature(signature);

    // call the scoring filters
    try {
      scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
    } catch (Exception e) {
      LOG.warn("Couldn't pass score, url {} ({})", turl, e);
    }

    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

    // Filters may reject the document by returning null.
    if (doc == null) {
      output.append("Document discarded by indexing filter\n");
      return 0;
    }

    // Dump every field; values are truncated to 100 chars unless dumpText is
    // set (presumably by a -dumpText command-line flag — confirm in run()).
    for (String fname : doc.getFieldNames()) {
      List<Object> values = doc.getField(fname).getValues();
      if (values != null) {
        for (Object value : values) {
          String str = value.toString();
          int minText = dumpText ? str.length() : Math.min(100, str.length());
          output.append(fname + " :\t" + str.substring(0, minText) + "\n");
        }
      }
    }
    
    output.append("\n"); // For readability if keepClientCnxOpen

    // Optionally send the document to the configured index writers when the
    // "doIndex" configuration property is true.
    if (getConf().getBoolean("doIndex", false) && doc != null) {
      IndexWriters writers = new IndexWriters(getConf());
      writers.open(new JobConf(getConf()), "IndexingFilterChecker");
      writers.write(doc);
      writers.close();
    }

    return 0;
  }
+  
+  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+    ProtocolFactory factory = new ProtocolFactory(getConf());
+    Protocol protocol = factory.getProtocol(url);
+    Text turl = new Text(url);
+    ProtocolOutput protocolOutput = protocol.getProtocolOutput(turl, datum);
+    return protocolOutput;
+  }
+
+  public static void main(String[] args) throws Exception {
+    final int res = ToolRunner.run(NutchConfiguration.create(),
+        new IndexingFiltersChecker(), args);
+    System.exit(res);
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
new file mode 100644
index 0000000..342ea4a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
@@ -0,0 +1,358 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.segment.SegmentChecker;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Generic indexer which relies on the plugins implementing IndexWriter
+ **/
+
+public class IndexingJob extends NutchTool implements Tool {
+
+  public static Logger LOG = LoggerFactory.getLogger(IndexingJob.class);
+
+  public IndexingJob() {
+    super(null);
+  }
+
+  public IndexingJob(Configuration conf) {
+    super(conf);
+  }
+
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, false, null);
+  }
+
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, null);
+  }
+
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, false, false);
+  }
+
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params, boolean filter,
+      boolean normalize) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, false,
+        false, false);
+  }
+
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params,
+      boolean filter, boolean normalize, boolean addBinaryContent) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, false,
+        false, false, false);
+  }
+
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params,
+      boolean filter, boolean normalize, boolean addBinaryContent,
+      boolean base64) throws IOException {
+
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("Indexer: starting at {}", sdf.format(start));
+
+    final JobConf job = new NutchJob(getConf());
+    job.setJobName("Indexer");
+
+    LOG.info("Indexer: deleting gone documents: {}", deleteGone);
+    LOG.info("Indexer: URL filtering: {}", filter);
+    LOG.info("Indexer: URL normalizing: {}", normalize);
+    if (addBinaryContent) {
+      if (base64) {
+        LOG.info("Indexer: adding binary content as Base64");
+      } else {
+        LOG.info("Indexer: adding binary content");
+      }
+    }        
+    IndexWriters writers = new IndexWriters(getConf());
+    LOG.info(writers.describe());
+
+    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job, addBinaryContent);
+
+    // NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
+    // job.set(SolrConstants.SERVER_URL, solrUrl);
+
+    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
+    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
+    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
+    job.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, base64);
+
+    if (params != null) {
+      job.set(IndexerMapReduce.INDEXER_PARAMS, params);
+    }
+
+    job.setReduceSpeculativeExecution(false);
+
+    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-"
+        + new Random().nextInt());
+
+    FileOutputFormat.setOutputPath(job, tmp);
+    try {
+      RunningJob indexJob = JobClient.runJob(job);
+      // do the commits once and for all the reducers in one go
+      if (!noCommit) {
+        writers.open(job, "commit");
+        writers.commit();
+      }
+      LOG.info("Indexer: number of documents indexed, deleted, or skipped:");
+      for (Counter counter : indexJob.getCounters().getGroup("IndexerStatus")) {
+        LOG.info("Indexer: {}  {}",
+            String.format(Locale.ROOT, "%6d", counter.getValue()),
+            counter.getName());
+      }
+      long end = System.currentTimeMillis();
+      LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: "
+          + TimingUtil.elapsedTime(start, end));
+    } finally {
+      FileSystem.get(job).delete(tmp, true);
+    }
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err
+      //.println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
+      .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize] [-addBinaryContent] [-base64]");
+      IndexWriters writers = new IndexWriters(getConf());
+      System.err.println(writers.describe());
+      return -1;
+    }
+
+    final Path crawlDb = new Path(args[0]);
+    Path linkDb = null;
+
+    final List<Path> segments = new ArrayList<Path>();
+    String params = null;
+
+    boolean noCommit = false;
+    boolean deleteGone = false;
+    boolean filter = false;
+    boolean normalize = false;
+    boolean addBinaryContent = false;
+    boolean base64 = false;
+
+    for (int i = 1; i < args.length; i++) {
+      FileSystem fs = null;
+      Path dir = null;
+      if (args[i].equals("-linkdb")) {
+        linkDb = new Path(args[++i]);
+      } else if (args[i].equals("-dir")) {
+        dir = new Path(args[++i]);
+        fs = dir.getFileSystem(getConf());
+        FileStatus[] fstats = fs.listStatus(dir,
+            HadoopFSUtil.getPassDirectoriesFilter(fs));
+        Path[] files = HadoopFSUtil.getPaths(fstats);
+        for (Path p : files) {
+          if (SegmentChecker.isIndexable(p,fs)) {
+            segments.add(p);
+          }
+        }
+      } else if (args[i].equals("-noCommit")) {
+        noCommit = true;
+      } else if (args[i].equals("-deleteGone")) {
+        deleteGone = true;
+      } else if (args[i].equals("-filter")) {
+        filter = true;
+      } else if (args[i].equals("-normalize")) {
+        normalize = true;
+      } else if (args[i].equals("-addBinaryContent")) {
+        addBinaryContent = true;
+      } else if (args[i].equals("-base64")) {
+        base64 = true;
+      } else if (args[i].equals("-params")) {
+        params = args[++i];
+      } else {
+        dir = new Path(args[i]);
+        fs = dir.getFileSystem(getConf());
+        if (SegmentChecker.isIndexable(dir,fs)) {
+          segments.add(dir);
+        }
+      }
+    }
+
+    try {
+      index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter, normalize, addBinaryContent, base64);
+      return 0;
+    } catch (final Exception e) {
+      LOG.error("Indexer: {}", StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    final int res = ToolRunner.run(NutchConfiguration.create(),
+        new IndexingJob(), args);
+    System.exit(res);
+  }
+
+
+  //Used for REST API
+  @Override
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+    boolean noCommit = false;
+    boolean deleteGone = false; 
+    boolean filter = false;
+    boolean normalize = false;
+    boolean isSegment = false;
+    String params= null;
+    Configuration conf = getConf();
+
+    Path crawlDb;
+    if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+      Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+      if(crawldbPath instanceof Path) {
+        crawlDb = (Path) crawldbPath;
+      }
+      else {
+        crawlDb = new Path(crawldbPath.toString());
+      }
+    }
+    else {
+      crawlDb = new Path(crawlId+"/crawldb");
+    }
+
+    Path linkdb = null;
+    List<Path> segments = new ArrayList<Path>();
+
+    if(args.containsKey(Nutch.ARG_LINKDB)){
+      if(args.containsKey(Nutch.ARG_LINKDB)) {
+        Object path = args.get(Nutch.ARG_LINKDB);
+        if(path instanceof Path) {
+          linkdb = (Path) path;
+        }
+        else {
+          linkdb = new Path(path.toString());
+        }
+      }
+      else {
+        linkdb = new Path(crawlId+"/linkdb");
+      }
+    }
+
+    if(args.containsKey(Nutch.ARG_SEGMENTDIR)){
+      isSegment = true;
+      Path segmentsDir;
+      Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+      if(segDir instanceof Path) {
+        segmentsDir = (Path) segDir;
+      }
+      else {
+        segmentsDir = new Path(segDir.toString());
+      }
+      FileSystem fs = segmentsDir.getFileSystem(getConf());
+      FileStatus[] fstats = fs.listStatus(segmentsDir,
+          HadoopFSUtil.getPassDirectoriesFilter(fs));
+      Path[] files = HadoopFSUtil.getPaths(fstats);
+      for (Path p : files) {
+        if (SegmentChecker.isIndexable(p,fs)) {
+          segments.add(p);
+        }
+      }     
+    }
+
+    if(args.containsKey(Nutch.ARG_SEGMENT)){
+      isSegment = true;
+      Object seg = args.get(Nutch.ARG_SEGMENT);
+      ArrayList<String> segmentList = new ArrayList<String>();
+      if(seg instanceof ArrayList) {
+        segmentList = (ArrayList<String>)seg;
+      }
+      for(String segment: segmentList) {
+        segments.add(new Path(segment));
+      }
+    }
+
+    if(!isSegment){
+      String segment_dir = crawlId+"/segments";
+      File segmentsDir = new File(segment_dir);
+      File[] segmentsList = segmentsDir.listFiles();  
+      Arrays.sort(segmentsList, new Comparator<File>(){
+        @Override
+        public int compare(File f1, File f2) {
+          if(f1.lastModified()>f2.lastModified())
+            return -1;
+          else
+            return 0;
+        }      
+      });
+      Path segment = new Path(segmentsList[0].getPath());
+      segments.add(segment);
+    }
+
+    if(args.containsKey("noCommit")){
+      noCommit = true;
+    }
+    if(args.containsKey("deleteGone")){
+      deleteGone = true;
+    }
+    if(args.containsKey("normalize")){
+      normalize = true;
+    }
+    if(args.containsKey("filter")){
+      filter = true;
+    }
+    if(args.containsKey("params")){
+      params = (String)args.get("params");
+    }
+    setConf(conf);
+    index(crawlDb, linkdb, segments, noCommit, deleteGone, params, filter,
+        normalize);
+    Map<String, Object> results = new HashMap<String, Object>();
+    results.put(Nutch.VAL_RESULT, 0);
+    return results;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
new file mode 100644
index 0000000..efdde02
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.nutch.metadata.Metadata;
+
+/** A {@link NutchDocument} is the unit of indexing. */
+public class NutchDocument implements Writable,
+    Iterable<Entry<String, NutchField>> {
+
+  public static final byte VERSION = 2;
+
+  private Map<String, NutchField> fields;
+
+  private Metadata documentMeta;
+
+  private float weight;
+
+  public NutchDocument() {
+    fields = new HashMap<String, NutchField>();
+    documentMeta = new Metadata();
+    weight = 1.0f;
+  }
+
+  public void add(String name, Object value) {
+    NutchField field = fields.get(name);
+    if (field == null) {
+      field = new NutchField(value);
+      fields.put(name, field);
+    } else {
+      field.add(value);
+    }
+  }
+
+  public Object getFieldValue(String name) {
+    NutchField field = fields.get(name);
+    if (field == null) {
+      return null;
+    }
+    if (field.getValues().size() == 0) {
+      return null;
+    }
+    return field.getValues().get(0);
+  }
+
+  public NutchField getField(String name) {
+    return fields.get(name);
+  }
+
+  public NutchField removeField(String name) {
+    return fields.remove(name);
+  }
+
+  public Collection<String> getFieldNames() {
+    return fields.keySet();
+  }
+
+  /** Iterate over all fields. */
+  public Iterator<Entry<String, NutchField>> iterator() {
+    return fields.entrySet().iterator();
+  }
+
+  public float getWeight() {
+    return weight;
+  }
+
+  public void setWeight(float weight) {
+    this.weight = weight;
+  }
+
+  public Metadata getDocumentMeta() {
+    return documentMeta;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    fields.clear();
+    byte version = in.readByte();
+    if (version != VERSION) {
+      throw new VersionMismatchException(VERSION, version);
+    }
+    int size = WritableUtils.readVInt(in);
+    for (int i = 0; i < size; i++) {
+      String name = Text.readString(in);
+      NutchField field = new NutchField();
+      field.readFields(in);
+      fields.put(name, field);
+    }
+    weight = in.readFloat();
+    documentMeta.readFields(in);
+  }
+
+  public void write(DataOutput out) throws IOException {
+    out.writeByte(VERSION);
+    WritableUtils.writeVInt(out, fields.size());
+    for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
+      Text.writeString(out, entry.getKey());
+      NutchField field = entry.getValue();
+      field.write(out);
+    }
+    out.writeFloat(weight);
+    documentMeta.write(out);
+  }
+
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("doc {\n");
+    for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
+      sb.append("\t");
+      sb.append(entry.getKey());
+      sb.append(":\t");
+      sb.append(entry.getValue());
+      sb.append("\n");
+    }
+    sb.append("}\n");
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
new file mode 100644
index 0000000..33911e1
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.io.*;
+
+/**
+ * This class represents a multi-valued field with a weight. Values are
+ * arbitrary objects.
+ */
+public class NutchField implements Writable {
+  private float weight;
+  private List<Object> values = new ArrayList<Object>();
+
+  public NutchField() {
+  }
+
+  public NutchField(Object value) {
+    this(value, 1.0f);
+  }
+
+  public NutchField(Object value, float weight) {
+    this.weight = weight;
+    if (value instanceof Collection) {
+      values.addAll((Collection<?>) value);
+    } else {
+      values.add(value);
+    }
+  }
+
+  public void add(Object value) {
+    values.add(value);
+  }
+
+  public float getWeight() {
+    return weight;
+  }
+
+  public void setWeight(float weight) {
+    this.weight = weight;
+  }
+
+  public List<Object> getValues() {
+    return values;
+  }
+
+  public void reset() {
+    weight = 1.0f;
+    values.clear();
+  }
+
+  @Override
+  public Object clone() throws CloneNotSupportedException {
+    NutchField result = (NutchField) super.clone();
+    result.weight = weight;
+    result.values = values;
+
+    return result;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    weight = in.readFloat();
+    int count = in.readInt();
+    values = new ArrayList<Object>();
+    for (int i = 0; i < count; i++) {
+      String type = Text.readString(in);
+
+      if (type.equals("java.lang.String")) {
+        values.add(Text.readString(in));
+      } else if (type.equals("java.lang.Boolean")) {
+        values.add(in.readBoolean());
+      } else if (type.equals("java.lang.Integer")) {
+        values.add(in.readInt());
+      } else if (type.equals("java.lang.Float")) {
+        values.add(in.readFloat());
+      } else if (type.equals("java.lang.Long")) {
+        values.add(in.readLong());
+      } else if (type.equals("java.util.Date")) {
+        values.add(new Date(in.readLong()));
+      }
+    }
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeFloat(weight);
+    out.writeInt(values.size());
+    for (Object value : values) {
+
+      Text.writeString(out, value.getClass().getName());
+
+      if (value instanceof Boolean) {
+        out.writeBoolean((Boolean) value);
+      } else if (value instanceof Integer) {
+        out.writeInt((Integer) value);
+      } else if (value instanceof Long) {
+        out.writeLong((Long) value);
+      } else if (value instanceof Float) {
+        out.writeFloat((Float) value);
+      } else if (value instanceof String) {
+        Text.writeString(out, (String) value);
+      } else if (value instanceof Date) {
+        Date date = (Date) value;
+        out.writeLong(date.getTime());
+      }
+    }
+  }
+
+  public String toString() {
+    return values.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
new file mode 100644
index 0000000..b2517c3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.indexer.NutchDocument;
+
+/**
+ * A {@link NutchIndexAction} is the new unit of indexing holding the document
+ * and action information.
+ */
+public class NutchIndexAction implements Writable {
+
+  public static final byte ADD = 0;
+  public static final byte DELETE = 1;
+  public static final byte UPDATE = 2;
+
+  public NutchDocument doc = null;
+  public byte action = ADD;
+
+  protected NutchIndexAction() {
+  }
+
+  public NutchIndexAction(NutchDocument doc, byte action) {
+    this.doc = doc;
+    this.action = action;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    action = in.readByte();
+    doc = new NutchDocument();
+    doc.readFields(in);
+  }
+
+  public void write(DataOutput out) throws IOException {
+    out.write(action);
+    doc.write(out);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/package.html b/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
new file mode 100644
index 0000000..825eaae
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
@@ -0,0 +1,10 @@
+<html>
+<body>
+Index content, configure and run indexing and cleaning jobs to 
+add, update, and delete documents from an index. Two tasks are
+delegated to plugins:
+<ul>
+<li>indexing filters fill index fields of each document</li>
+<li>index writer plugins send documents to index back-ends (Solr, etc.)</li></ul>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java b/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
new file mode 100644
index 0000000..f9c425b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+/**
+ * A collection of Creative Commons properties names.
+ * 
+ * @see <a href="http://www.creativecommons.org/">creativecommons.org</a>
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface CreativeCommons {
+
+  /** Metadata property name for the Creative Commons license URL. */
+  public final static String LICENSE_URL = "License-Url";
+
+  /** Metadata property name for the location of the license within the document. */
+  public final static String LICENSE_LOCATION = "License-Location";
+
+  /** Metadata property name for the type of the licensed work. */
+  public final static String WORK_TYPE = "Work-Type";
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java b/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
new file mode 100644
index 0000000..9724d80
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+/**
+ * A collection of Dublin Core metadata names. Constant values correspond to
+ * the element names of the Dublin Core Metadata Element Set.
+ * 
+ * @see <a href="http://dublincore.org">dublincore.org</a>
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface DublinCore {
+
+  /**
+   * Typically, Format may include the media-type or dimensions of the resource.
+   * Format may be used to determine the software, hardware or other equipment
+   * needed to display or operate the resource. Examples of dimensions include
+   * size and duration. Recommended best practice is to select a value from a
+   * controlled vocabulary (for example, the list of Internet Media Types [MIME]
+   * defining computer media formats).
+   */
+  public static final String FORMAT = "format";
+
+  /**
+   * Recommended best practice is to identify the resource by means of a string
+   * or number conforming to a formal identification system. Example formal
+   * identification systems include the Uniform Resource Identifier (URI)
+   * (including the Uniform Resource Locator (URL)), the Digital Object
+   * Identifier (DOI) and the International Standard Book Number (ISBN).
+   */
+  public static final String IDENTIFIER = "identifier";
+
+  /**
+   * Date on which the resource was changed.
+   */
+  public static final String MODIFIED = "modified";
+
+  /**
+   * An entity responsible for making contributions to the content of the
+   * resource. Examples of a Contributor include a person, an organisation, or a
+   * service. Typically, the name of a Contributor should be used to indicate
+   * the entity.
+   */
+  public static final String CONTRIBUTOR = "contributor";
+
+  /**
+   * The extent or scope of the content of the resource. Coverage will typically
+   * include spatial location (a place name or geographic coordinates), temporal
+   * period (a period label, date, or date range) or jurisdiction (such as a
+   * named administrative entity). Recommended best practice is to select a
+   * value from a controlled vocabulary (for example, the Thesaurus of
+   * Geographic Names [TGN]) and that, where appropriate, named places or time
+   * periods be used in preference to numeric identifiers such as sets of
+   * coordinates or date ranges.
+   */
+  public static final String COVERAGE = "coverage";
+
+  /**
+   * An entity primarily responsible for making the content of the resource.
+   * Examples of a Creator include a person, an organisation, or a service.
+   * Typically, the name of a Creator should be used to indicate the entity.
+   */
+  public static final String CREATOR = "creator";
+
+  /**
+   * A date associated with an event in the life cycle of the resource.
+   * Typically, Date will be associated with the creation or availability of the
+   * resource. Recommended best practice for encoding the date value is defined
+   * in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD format.
+   */
+  public static final String DATE = "date";
+
+  /**
+   * An account of the content of the resource. Description may include but is
+   * not limited to: an abstract, table of contents, reference to a graphical
+   * representation of content or a free-text account of the content.
+   */
+  public static final String DESCRIPTION = "description";
+
+  /**
+   * A language of the intellectual content of the resource. Recommended best
+   * practice is to use RFC 3066 [RFC3066], which, in conjunction with ISO 639
+   * [ISO639], defines two- and three-letter primary language tags with optional
+   * subtags. Examples include "en" or "eng" for English, "akk" for Akkadian,
+   * and "en-GB" for English used in the United Kingdom.
+   */
+  public static final String LANGUAGE = "language";
+
+  /**
+   * An entity responsible for making the resource available. Examples of a
+   * Publisher include a person, an organisation, or a service. Typically, the
+   * name of a Publisher should be used to indicate the entity.
+   */
+  public static final String PUBLISHER = "publisher";
+
+  /**
+   * A reference to a related resource. Recommended best practice is to
+   * reference the resource by means of a string or number conforming to a
+   * formal identification system.
+   */
+  public static final String RELATION = "relation";
+
+  /**
+   * Information about rights held in and over the resource. Typically, a Rights
+   * element will contain a rights management statement for the resource, or
+   * reference a service providing such information. Rights information often
+   * encompasses Intellectual Property Rights (IPR), Copyright, and various
+   * Property Rights. If the Rights element is absent, no assumptions can be
+   * made about the status of these and other rights with respect to the
+   * resource.
+   */
+  public static final String RIGHTS = "rights";
+
+  /**
+   * A reference to a resource from which the present resource is derived. The
+   * present resource may be derived from the Source resource in whole or in
+   * part. Recommended best practice is to reference the resource by means of a
+   * string or number conforming to a formal identification system.
+   */
+  public static final String SOURCE = "source";
+
+  /**
+   * The topic of the content of the resource. Typically, a Subject will be
+   * expressed as keywords, key phrases or classification codes that describe a
+   * topic of the resource. Recommended best practice is to select a value from
+   * a controlled vocabulary or formal classification scheme.
+   */
+  public static final String SUBJECT = "subject";
+
+  /**
+   * A name given to the resource. Typically, a Title will be a name by which
+   * the resource is formally known.
+   */
+  public static final String TITLE = "title";
+
+  /**
+   * The nature or genre of the content of the resource. Type includes terms
+   * describing general categories, functions, genres, or aggregation levels for
+   * content. Recommended best practice is to select a value from a controlled
+   * vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). To describe
+   * the physical or digital manifestation of the resource, use the Format
+   * element.
+   */
+  public static final String TYPE = "type";
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
new file mode 100644
index 0000000..2697da6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.metadata;
+
+/**
+ * A collection of Feed property names extracted by the ROME library.
+ * 
+ * 
+ * @author mattmann
+ * @author dogacan
+ */
+public interface Feed {
+
+  /** Metadata property name for the feed entry's author. */
+  public static final String FEED_AUTHOR = "author";
+
+  /** Metadata property name for feed entry tags/categories. */
+  public static final String FEED_TAGS = "tag";
+
+  /** Metadata property name for the entry's published date. */
+  public static final String FEED_PUBLISHED = "published";
+
+  /** Metadata property name for the entry's last-updated date. */
+  public static final String FEED_UPDATED = "updated";
+
+  /** Metadata property name identifying the enclosing feed. */
+  public static final String FEED = "feed";
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java b/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
new file mode 100644
index 0000000..78b8797
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * A collection of HTTP header names.
+ * 
+ * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol
+ *      -- HTTP/1.1 (RFC 2616)</a>
+ */
+public interface HttpHeaders {
+
+  public final static String TRANSFER_ENCODING = "Transfer-Encoding";
+
+  public final static String CONTENT_ENCODING = "Content-Encoding";
+
+  public final static String CONTENT_LANGUAGE = "Content-Language";
+
+  public final static String CONTENT_LENGTH = "Content-Length";
+
+  public final static String CONTENT_LOCATION = "Content-Location";
+
+  public static final String CONTENT_DISPOSITION = "Content-Disposition";
+
+  public final static String CONTENT_MD5 = "Content-MD5";
+
+  public final static String CONTENT_TYPE = "Content-Type";
+
+  /** {@link #CONTENT_TYPE} pre-wrapped as a Hadoop {@link Text} constant. */
+  public static final Text WRITABLE_CONTENT_TYPE = new Text(CONTENT_TYPE);
+
+  public final static String LAST_MODIFIED = "Last-Modified";
+
+  public final static String LOCATION = "Location";
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java b/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
new file mode 100644
index 0000000..a43fa9d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.metadata;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.crawl.NutchWritable;
+
+/**
+ * This is a simple decorator that adds metadata to any Writable-s that can be
+ * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be
+ * temporarily enriched during processing, but this temporary metadata doesn't
+ * need to be permanently stored after the job is done.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class MetaWrapper extends NutchWritable {
+  private Metadata metadata;
+
+  /** Creates a wrapper with no wrapped instance and empty metadata. */
+  public MetaWrapper() {
+    super();
+    metadata = new Metadata();
+  }
+
+  /**
+   * Wraps the given instance with empty metadata.
+   * 
+   * @param instance
+   *          Writable to wrap
+   * @param conf
+   *          configuration
+   */
+  public MetaWrapper(Writable instance, Configuration conf) {
+    super(instance);
+    metadata = new Metadata();
+    setConf(conf);
+  }
+
+  /**
+   * Wraps the given instance with the supplied metadata.
+   * 
+   * @param metadata
+   *          initial metadata; if null, empty metadata is used instead
+   * @param instance
+   *          Writable to wrap
+   * @param conf
+   *          configuration
+   */
+  public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) {
+    super(instance);
+    if (metadata == null)
+      metadata = new Metadata();
+    this.metadata = metadata;
+    setConf(conf);
+  }
+
+  /**
+   * Get all metadata.
+   */
+  public Metadata getMetadata() {
+    return metadata;
+  }
+
+  /**
+   * Add metadata. See {@link Metadata#add(String, String)} for more
+   * information.
+   * 
+   * @param name
+   *          metadata name
+   * @param value
+   *          metadata value
+   */
+  public void addMeta(String name, String value) {
+    metadata.add(name, value);
+  }
+
+  /**
+   * Set metadata. See {@link Metadata#set(String, String)} for more
+   * information.
+   * 
+   * @param name
+   *          metadata name
+   * @param value
+   *          metadata value
+   */
+  public void setMeta(String name, String value) {
+    metadata.set(name, value);
+  }
+
+  /**
+   * Get metadata. See {@link Metadata#get(String)} for more information.
+   * 
+   * @param name
+   *          metadata name
+   * @return metadata value
+   */
+  public String getMeta(String name) {
+    return metadata.get(name);
+  }
+
+  /**
+   * Get multiple metadata. See {@link Metadata#getValues(String)} for more
+   * information.
+   * 
+   * @param name
+   *          metadata name
+   * @return multiple values
+   */
+  public String[] getMetaValues(String name) {
+    return metadata.getValues(name);
+  }
+
+  /** Deserializes the wrapped instance first, then the attached metadata. */
+  public void readFields(DataInput in) throws IOException {
+    super.readFields(in);
+    metadata = new Metadata();
+    metadata.readFields(in);
+  }
+
+  /** Serializes the wrapped instance first, then the attached metadata. */
+  public void write(DataOutput out) throws IOException {
+    super.write(out);
+    metadata.write(out);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
new file mode 100644
index 0000000..8a57ee3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A multi-valued metadata container. Maps each metadata name to an ordered
+ * array of string values.
+ */
+public class Metadata implements Writable, CreativeCommons, DublinCore,
+    HttpHeaders, Nutch, Feed {
+
+  /**
+   * A map of all metadata attributes.
+   */
+  private Map<String, String[]> metadata = null;
+
+  /**
+   * Constructs a new, empty metadata.
+   */
+  public Metadata() {
+    metadata = new HashMap<String, String[]>();
+  }
+
+  /**
+   * Returns true if named value is multivalued.
+   * 
+   * @param name
+   *          name of metadata
+   * @return true if the named value is multivalued, false if single value or
+   *         null
+   */
+  public boolean isMultiValued(final String name) {
+    return metadata.get(name) != null && metadata.get(name).length > 1;
+  }
+
+  /**
+   * Returns an array of the names contained in the metadata.
+   * 
+   * @return Metadata names
+   */
+  public String[] names() {
+    return metadata.keySet().toArray(new String[metadata.keySet().size()]);
+  }
+
+  /**
+   * Get the value associated to a metadata name. If many values are associated
+   * to the specified name, then the first one is returned.
+   * 
+   * @param name
+   *          of the metadata.
+   * @return the value associated to the specified metadata name, or null if
+   *         the name is unknown.
+   */
+  public String get(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      return null;
+    } else {
+      return values[0];
+    }
+  }
+
+  /**
+   * Get the values associated to a metadata name.
+   * 
+   * @param name
+   *          of the metadata.
+   * @return the values associated to a metadata name (an empty array if the
+   *         name is unknown, never null).
+   */
+  public String[] getValues(final String name) {
+    return _getValues(name);
+  }
+
+  // Internal accessor that normalizes "absent" to an empty array.
+  private String[] _getValues(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      values = new String[0];
+    }
+    return values;
+  }
+
+  /**
+   * Add a metadata name/value mapping. Add the specified value to the list of
+   * values associated to the specified metadata name.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void add(final String name, final String value) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      set(name, value);
+    } else {
+      // Grow the value array by one and append at the end.
+      String[] newValues = new String[values.length + 1];
+      System.arraycopy(values, 0, newValues, 0, values.length);
+      newValues[newValues.length - 1] = value;
+      metadata.put(name, newValues);
+    }
+  }
+
+  /**
+   * Add all name/value mappings (merge two metadata mappings). If a name
+   * already exists in current metadata the values are added to existing values.
+   *
+   * @param metadata
+   *          other Metadata to be merged
+   */
+  public void addAll(Metadata metadata) {
+    for (String name : metadata.names()) {
+      String[] addValues = metadata.getValues(name);
+      if (addValues == null)
+        continue;
+      String[] oldValues = this.metadata.get(name);
+      if (oldValues == null) {
+        this.metadata.put(name, addValues);
+      } else {
+        // Concatenate old and new value arrays, preserving order.
+        String[] newValues = new String[oldValues.length + addValues.length];
+        System.arraycopy(oldValues, 0, newValues, 0, oldValues.length);
+        System.arraycopy(addValues, 0, newValues, oldValues.length,
+            addValues.length);
+        this.metadata.put(name, newValues);
+      }
+    }
+  }
+
+  /**
+   * Copy All key-value pairs from properties. Each property becomes a
+   * single-valued entry, replacing any existing values for that name.
+   * 
+   * @param properties
+   *          properties to copy from
+   */
+  public void setAll(Properties properties) {
+    Enumeration<?> names = properties.propertyNames();
+    while (names.hasMoreElements()) {
+      String name = (String) names.nextElement();
+      metadata.put(name, new String[] { properties.getProperty(name) });
+    }
+  }
+
+  /**
+   * Set metadata name/value. Associate the specified value to the specified
+   * metadata name. If some previous values were associated to this name, they
+   * are removed.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void set(String name, String value) {
+    metadata.put(name, new String[] { value });
+  }
+
+  /**
+   * Remove a metadata and all its associated values.
+   * 
+   * @param name
+   *          metadata name to remove
+   */
+  public void remove(String name) {
+    metadata.remove(name);
+  }
+
+  /**
+   * Returns the number of metadata names in this metadata.
+   * 
+   * @return number of metadata names
+   */
+  public int size() {
+    return metadata.size();
+  }
+
+  /** Remove all mappings from metadata. */
+  public void clear() {
+    metadata.clear();
+  }
+
+  // NOTE(review): equals() is overridden without a matching hashCode()
+  // override, which violates the equals/hashCode contract if Metadata
+  // instances are used as keys in hash-based collections.
+  public boolean equals(Object o) {
+
+    if (o == null) {
+      return false;
+    }
+
+    Metadata other = null;
+    try {
+      other = (Metadata) o;
+    } catch (ClassCastException cce) {
+      return false;
+    }
+
+    if (other.size() != size()) {
+      return false;
+    }
+
+    // Compare each name's value arrays element by element, in order.
+    String[] names = names();
+    for (int i = 0; i < names.length; i++) {
+      String[] otherValues = other._getValues(names[i]);
+      String[] thisValues = _getValues(names[i]);
+      if (otherValues.length != thisValues.length) {
+        return false;
+      }
+      for (int j = 0; j < otherValues.length; j++) {
+        if (!otherValues[j].equals(thisValues[j])) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  /** Returns space-separated name=value pairs, one pair per value. */
+  public String toString() {
+    StringBuffer buf = new StringBuffer();
+    String[] names = names();
+    for (int i = 0; i < names.length; i++) {
+      String[] values = _getValues(names[i]);
+      for (int j = 0; j < values.length; j++) {
+        buf.append(names[i]).append("=").append(values[j]).append(" ");
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Serializes the number of names, then for each name its non-null values
+   * (null values are counted out and skipped).
+   */
+  public final void write(DataOutput out) throws IOException {
+    out.writeInt(size());
+    String[] values = null;
+    String[] names = names();
+    for (int i = 0; i < names.length; i++) {
+      Text.writeString(out, names[i]);
+      values = _getValues(names[i]);
+      // Only non-null values are serialized; count them first.
+      int cnt = 0;
+      for (int j = 0; j < values.length; j++) {
+        if (values[j] != null)
+          cnt++;
+      }
+      out.writeInt(cnt);
+      for (int j = 0; j < values.length; j++) {
+        if (values[j] != null) {
+          Text.writeString(out, values[j]);
+        }
+      }
+    }
+  }
+
+  /**
+   * Deserializes metadata written by {@link #write(DataOutput)}, appending
+   * each name/value pair via {@link #add(String, String)}.
+   */
+  public final void readFields(DataInput in) throws IOException {
+    int keySize = in.readInt();
+    String key;
+    for (int i = 0; i < keySize; i++) {
+      key = Text.readString(in);
+      int valueSize = in.readInt();
+      for (int j = 0; j < valueSize; j++) {
+        add(key, Text.readString(in));
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
new file mode 100644
index 0000000..de80399
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * A collection of Nutch internal metadata constants.
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface Nutch {
+
+	/** Metadata key for the character encoding originally declared/detected. */
+	public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
+
+	/** Metadata key for the character encoding used for content conversion. */
+	public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
+
+	/** Metadata key for the digest (signature) of the fetched content. */
+	public static final String SIGNATURE_KEY = "nutch.content.digest";
+
+	/** Metadata key for the name of the segment a record belongs to. */
+	public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+
+	/** Metadata key for the crawl score of a page. */
+	public static final String SCORE_KEY = "nutch.crawl.score";
+
+	/** Internal marker key holding the generate timestamp. */
+	public static final String GENERATE_TIME_KEY = "_ngt_";
+
+	/** {@link #GENERATE_TIME_KEY} pre-wrapped as a Hadoop {@link Text}. */
+	public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
+			GENERATE_TIME_KEY);
+
+	/** Metadata key for the numeric protocol status code. */
+	public static final Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code");
+
+	/** Internal marker key for the protocol status. */
+	public static final String PROTO_STATUS_KEY = "_pst_";
+
+	/** {@link #PROTO_STATUS_KEY} pre-wrapped as a Hadoop {@link Text}. */
+	public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
+			PROTO_STATUS_KEY);
+
+	/** Internal marker key for the fetch time. */
+	public static final String FETCH_TIME_KEY = "_ftk_";
+
+	/** Internal marker key for the fetch status. */
+	public static final String FETCH_STATUS_KEY = "_fst_";
+
+	/**
+	 * Sites may request that search engines don't provide access to cached
+	 * documents.
+	 */
+	public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
+
+	/** Show both original forbidden content and summaries (default). */
+	public static final String CACHING_FORBIDDEN_NONE = "none";
+
+	/** Don't show either original forbidden content or summaries. */
+	public static final String CACHING_FORBIDDEN_ALL = "all";
+
+	/** Don't show original forbidden content, but show summaries. */
+	public static final String CACHING_FORBIDDEN_CONTENT = "content";
+
+	/** Internal marker key for the representative URL. */
+	public static final String REPR_URL_KEY = "_repr_";
+
+	/** {@link #REPR_URL_KEY} pre-wrapped as a Hadoop {@link Text}. */
+	public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
+
+	/** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
+	public static final String FIXED_INTERVAL_KEY = "fixedInterval";
+
+	/** {@link #FIXED_INTERVAL_KEY} pre-wrapped as a Hadoop {@link Text}. */
+	public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
+			FIXED_INTERVAL_KEY);
+
+	
+	 /** For progress of job. Used by the Nutch REST service */
+	public static final String STAT_PROGRESS = "progress";
+	/**Used by Nutch REST service */
+	public static final String CRAWL_ID_KEY = "storage.crawl.id";
+	/** Argument key to specify location of the seed url dir for the REST endpoints **/
+	public static final String ARG_SEEDDIR = "url_dir";
+	/** Argument key to specify the location of crawldb for the REST endpoints **/
+	public static final String ARG_CRAWLDB = "crawldb";
+	/** Argument key to specify the location of linkdb for the REST endpoints **/
+	public static final String ARG_LINKDB = "linkdb";
+	/** Name of the key used in the Result Map sent back by the REST endpoint **/
+	public static final String VAL_RESULT = "result";
+	/** Argument key to specify the location of a directory of segments for the REST endpoints.
+	 * Similar to the -dir command in the bin/nutch script **/
+	public static final String ARG_SEGMENTDIR = "segment_dir";
+	/** Argument key to specify the location of individual segment for the REST endpoints **/
+	public static final String ARG_SEGMENT = "segment";
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
new file mode 100644
index 0000000..164ca1d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * A decorator to Metadata that adds spellchecking capabilities to property
+ * names. Currently used spelling vocabulary contains just the httpheaders from
+ * {@link HttpHeaders} class.
+ * 
+ */
+public class SpellCheckedMetadata extends Metadata {
+
+  /**
+   * Threshold divider, used to derive the edit-distance tolerance:
+   * 
+   * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
+   */
+  private static final int TRESHOLD_DIVIDER = 3;
+
+  /**
+   * Normalized name to name mapping.
+   */
+  private final static Map<String, String> NAMES_IDX = new HashMap<String, String>();
+
+  /**
+   * Array holding map keys.
+   */
+  private static String[] normalized = null;
+
+  static {
+
+    // Uses following array to fill the metanames index and the
+    // metanames list.
+    Class<?>[] spellthese = { HttpHeaders.class };
+
+    for (Class<?> spellCheckedNames : spellthese) {
+      for (Field field : spellCheckedNames.getFields()) {
+        int mods = field.getModifiers();
+        // Only index public static final String constants (the header names)
+        if (Modifier.isFinal(mods) && Modifier.isPublic(mods)
+            && Modifier.isStatic(mods) && field.getType().equals(String.class)) {
+          try {
+            String val = (String) field.get(null);
+            NAMES_IDX.put(normalize(val), val);
+          } catch (Exception e) {
+            // Simply ignore...
+          }
+        }
+      }
+    }
+    normalized = NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]);
+  }
+
+  /**
+   * Normalizes String: lower-cases it and strips every non-letter character.
+   * 
+   * @param str
+   *          the string to normalize
+   * @return normalized String
+   */
+  private static String normalize(final String str) {
+    char c;
+    // NOTE(review): a StringBuilder would suffice here (no shared mutation)
+    StringBuffer buf = new StringBuffer();
+    for (int i = 0; i < str.length(); i++) {
+      c = str.charAt(i);
+      if (Character.isLetter(c)) {
+        buf.append(Character.toLowerCase(c));
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Get the normalized name of metadata attribute name. This method tries to
+   * find a well-known metadata name (one of the metadata names defined in this
+   * class) that matches the specified name. The matching is error tolerant. For
+   * instance,
+   * <ul>
+   * <li>content-type gives Content-Type</li>
+   * <li>CoNtEntType gives Content-Type</li>
+   * <li>ConTnTtYpe gives Content-Type</li>
+   * </ul>
+   * If no matching with a well-known metadata name is found, then the original
+   * name is returned.
+   * 
+   * @param name
+   *          Name to normalize
+   * @return normalized name
+   */
+  public static String getNormalizedName(final String name) {
+    String searched = normalize(name);
+    String value = NAMES_IDX.get(searched);
+
+    // No exact normalized match: fall back to a fuzzy Levenshtein scan,
+    // accepting a candidate only within the length-proportional threshold
+    if ((value == null) && (normalized != null)) {
+      int threshold = searched.length() / TRESHOLD_DIVIDER;
+      for (int i = 0; i < normalized.length && value == null; i++) {
+        if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
+          value = NAMES_IDX.get(normalized[i]);
+        }
+      }
+    }
+    return (value != null) ? value : name;
+  }
+
+  @Override
+  public void remove(final String name) {
+    super.remove(getNormalizedName(name));
+  }
+
+  @Override
+  public void add(final String name, final String value) {
+    super.add(getNormalizedName(name), value);
+  }
+
+  @Override
+  public String[] getValues(final String name) {
+    return super.getValues(getNormalizedName(name));
+  }
+
+  @Override
+  public String get(final String name) {
+    return super.get(getNormalizedName(name));
+  }
+
+  @Override
+  public void set(final String name, final String value) {
+    super.set(getNormalizedName(name), value);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/package.html b/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
new file mode 100644
index 0000000..53281bb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+A Multi-valued Metadata container, and set
+of constant fields for Nutch Metadata.
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
new file mode 100644
index 0000000..8de5800
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+//Hadoop
+import org.apache.hadoop.conf.Configurable;
+// Nutch
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Interface used to allow exemptions to external domain resources by overriding <code>db.ignore.external.links</code>.
+ * This is useful when the crawl is focused to a domain but resources like images are hosted on CDN.
+ */
+
+public interface URLExemptionFilter extends Pluggable, Configurable{
+
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = URLExemptionFilter.class.getName();
+
+  /**
+   * Checks whether {@code toUrl} is exempted when ignoring of external links
+   * is enabled.
+   * @param fromUrl : the source url which generated the outlink
+   * @param toUrl : the destination url which needs to be checked for exemption
+   * @return true when toUrl is exempted from dbIgnore
+   */
+  public boolean filter(String fromUrl, String toUrl);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
new file mode 100644
index 0000000..d362f2e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Creates and caches {@link URLExemptionFilter} implementing plugins. */
+public class URLExemptionFilters {
+
+  private static final Logger LOG = LoggerFactory.getLogger(URLExemptionFilters.class);
+
+  // Instances of all plugins registered at the URLExemptionFilter
+  // extension point, instantiated once at construction time.
+  private URLExemptionFilter[] filters;
+
+  /**
+   * Loads every extension registered at
+   * {@link URLExemptionFilter#X_POINT_ID} from the plugin repository.
+   *
+   * @param conf the configuration used to look up the plugin repository
+   */
+  public URLExemptionFilters(Configuration conf) {
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLExemptionFilter.X_POINT_ID).getExtensions();
+    filters = new URLExemptionFilter[extensions.length];
+    for (int i = 0; i < extensions.length; i++) {
+      try {
+        filters[i] = (URLExemptionFilter) extensions[i].getExtensionInstance();
+      } catch (PluginRuntimeException e) {
+        // A filter that cannot be instantiated is a fatal misconfiguration
+        throw new IllegalStateException(e);
+      }
+    }
+    LOG.info("Found {} extensions at point:'{}'", filters.length,
+        URLExemptionFilter.X_POINT_ID);
+  }
+
+
+  /**
+   * Run all defined filters. Assume logical AND.
+   * Returns false when no filter is configured or either URL is null.
+   */
+  public boolean isExempted(String fromUrl, String toUrl) {
+    if (filters.length < 1) {
+      //at least one filter should be on
+      return false;
+    }
+    //validate from, to and filters
+    boolean exempted = fromUrl != null && toUrl != null;
+    //An URL is exempted when all the filters accept it to pass through
+    for (int i = 0; i < this.filters.length && exempted; i++) {
+      exempted = this.filters[i].filter(fromUrl, toUrl);
+    }
+    return exempted;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
new file mode 100644
index 0000000..01efbcd
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Interface used to limit which URLs enter Nutch. Used by the injector and the
+ * db updater.
+ */
+
+public interface URLFilter extends Pluggable, Configurable {
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = URLFilter.class.getName();
+
+  /**
+   * Filters a URL: implementations may pass the (possibly transformed) URL
+   * through, or "delete" the URL by returning null.
+   *
+   * @param urlString the URL to check
+   * @return the accepted (possibly modified) URL, or null to reject it
+   */
+  public String filter(String urlString);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
new file mode 100644
index 0000000..89a3d00
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * Checks one given filter or all filters.
+ * 
+ * @author John Xing
+ */
+public class URLFilterChecker {
+
+  private Configuration conf;
+
+  public URLFilterChecker(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Runs the single URLFilter whose class name matches {@code filterName}
+   * against URLs read from stdin, printing each result prefixed with '+'
+   * (accepted) or '-' (rejected).
+   *
+   * @param filterName fully qualified class name of the filter to check
+   * @throws Exception if the extension point or the named filter is not found
+   */
+  private void checkOne(String filterName) throws Exception {
+    URLFilter filter = null;
+
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLFilter.X_POINT_ID);
+
+    if (point == null)
+      throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
+
+    Extension[] extensions = point.getExtensions();
+
+    // Scan all registered extensions for the one with the requested class name
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      filter = (URLFilter) extension.getExtensionInstance();
+      if (filter.getClass().getName().equals(filterName)) {
+        break;
+      } else {
+        filter = null;
+      }
+    }
+
+    if (filter == null)
+      throw new RuntimeException("Filter " + filterName + " not found.");
+
+    // jerome : should we keep this behavior?
+    // if (LogFormatter.hasLoggedSevere())
+    // throw new RuntimeException("Severe error encountered.");
+
+    System.out.println("Checking URLFilter " + filterName);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
+        System.out.print("+");
+        System.out.println(out);
+      } else {
+        System.out.print("-");
+        System.out.println(line);
+      }
+    }
+  }
+
+  /**
+   * Runs the full configured URLFilter chain against URLs read from stdin,
+   * printing each result prefixed with '+' (accepted) or '-' (rejected).
+   */
+  private void checkAll() throws Exception {
+    System.out.println("Checking combination of all URLFilters available");
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      URLFilters filters = new URLFilters(this.conf);
+      String out = filters.filter(line);
+      if (out != null) {
+        System.out.print("+");
+        System.out.println(out);
+      } else {
+        System.out.print("-");
+        System.out.println(line);
+      }
+    }
+  }
+
+  /**
+   * Command-line entry point; see the usage string for accepted arguments.
+   * URLs to check are read from stdin, one per line.
+   */
+  public static void main(String[] args) throws Exception {
+
+    String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n"
+        + "Tool takes a list of URLs, one per line, passed via STDIN.\n";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    String filterName = null;
+    if (args[0].equals("-filterName")) {
+      if (args.length != 2) {
+        System.err.println(usage);
+        System.exit(-1);
+      }
+      filterName = args[1];
+    }
+
+    URLFilterChecker checker = new URLFilterChecker(NutchConfiguration.create());
+    if (filterName != null) {
+      checker.checkOne(filterName);
+    } else {
+      checker.checkAll();
+    }
+
+    System.exit(0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
new file mode 100644
index 0000000..b367b56
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+/** Exception thrown by {@link URLFilter} implementations on filtering errors. */
+@SuppressWarnings("serial")
+public class URLFilterException extends Exception {
+
+  public URLFilterException() {
+    super();
+  }
+
+  public URLFilterException(String message) {
+    super(message);
+  }
+
+  public URLFilterException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  public URLFilterException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
new file mode 100644
index 0000000..3deccca
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.PluginRepository;
+
+/** Creates and caches {@link URLFilter} implementing plugins. */
+public class URLFilters {
+
+  /** Config property defining the order in which filters are applied. */
+  public static final String URLFILTER_ORDER = "urlfilter.order";
+  private URLFilter[] filters;
+
+  public URLFilters(Configuration conf) {
+    this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins(
+        URLFilter.class, URLFilter.X_POINT_ID, URLFILTER_ORDER);
+  }
+
+  /**
+   * Run all defined filters. Assume logical AND.
+   * The URL is passed through each filter in order; as soon as any filter
+   * returns null the chain short-circuits and null is returned.
+   *
+   * @param urlString the URL to filter
+   * @return the (possibly transformed) URL, or null if any filter rejected it
+   */
+  public String filter(String urlString) throws URLFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      if (urlString == null)
+        return null;
+      urlString = this.filters[i].filter(urlString);
+
+    }
+    return urlString;
+  }
+}


[49/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDb.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDb.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDb.java
new file mode 100644
index 0000000..1537cdc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDb.java
@@ -0,0 +1,349 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.util.*;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.LockUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
+
+/**
+ * This class takes the output of the fetcher and updates the crawldb
+ * accordingly.
+ */
+public class CrawlDb extends NutchTool implements Tool {
+  public static final Logger LOG = LoggerFactory.getLogger(CrawlDb.class);
+
+  public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
+
+  public static final String CRAWLDB_PURGE_404 = "db.update.purge.404";
+
+  public static final String CURRENT_NAME = "current";
+
+  public static final String LOCK_NAME = ".locked";
+
+  public CrawlDb() {
+  }
+
+  public CrawlDb(Configuration conf) {
+    setConf(conf);
+  }
+
+  public void update(Path crawlDb, Path[] segments, boolean normalize,
+      boolean filter) throws IOException {
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED,
+        true);
+    update(crawlDb, segments, normalize, filter, additionsAllowed, false);
+  }
+
+  public void update(Path crawlDb, Path[] segments, boolean normalize,
+      boolean filter, boolean additionsAllowed, boolean force)
+      throws IOException {
+    FileSystem fs = FileSystem.get(getConf());
+    Path lock = new Path(crawlDb, LOCK_NAME);
+    LockUtil.createLockFile(fs, lock, force);
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+
+    JobConf job = CrawlDb.createJob(getConf(), crawlDb);
+    job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
+    job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
+    job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
+
+    boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb update: starting at " + sdf.format(start));
+      LOG.info("CrawlDb update: db: " + crawlDb);
+      LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
+      LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
+      LOG.info("CrawlDb update: URL normalizing: " + normalize);
+      LOG.info("CrawlDb update: URL filtering: " + filter);
+      LOG.info("CrawlDb update: 404 purging: " + url404Purging);
+    }
+
+    for (int i = 0; i < segments.length; i++) {
+      Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
+      Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
+      if (fs.exists(fetch) && fs.exists(parse)) {
+        FileInputFormat.addInputPath(job, fetch);
+        FileInputFormat.addInputPath(job, parse);
+      } else {
+        LOG.info(" - skipping invalid segment " + segments[i]);
+      }
+    }
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb update: Merging segment data into db.");
+    }
+    try {
+      JobClient.runJob(job);
+    } catch (IOException e) {
+      LockUtil.removeLockFile(fs, lock);
+      Path outPath = FileOutputFormat.getOutputPath(job);
+      if (fs.exists(outPath))
+        fs.delete(outPath, true);
+      throw e;
+    }
+
+    CrawlDb.install(job, crawlDb);
+    long end = System.currentTimeMillis();
+    LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  /*
+   * Configure a new CrawlDb in a temp folder at crawlDb/<rand>
+   */
+  public static JobConf createJob(Configuration config, Path crawlDb)
+      throws IOException {
+    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random()
+        .nextInt(Integer.MAX_VALUE)));
+
+    JobConf job = new NutchJob(config);
+    job.setJobName("crawldb " + crawlDb);
+
+    Path current = new Path(crawlDb, CURRENT_NAME);
+    if (FileSystem.get(job).exists(current)) {
+      FileInputFormat.addInputPath(job, current);
+    }
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(CrawlDbFilter.class);
+    job.setReducerClass(CrawlDbReducer.class);
+
+    FileOutputFormat.setOutputPath(job, newCrawlDb);
+    job.setOutputFormat(MapFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+
+    // https://issues.apache.org/jira/browse/NUTCH-1110
+    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    return job;
+  }
+
+  public static void install(JobConf job, Path crawlDb) throws IOException {
+    boolean preserveBackup = job.getBoolean("db.preserve.backup", true);
+
+    Path newCrawlDb = FileOutputFormat.getOutputPath(job);
+    FileSystem fs = new JobClient(job).getFs();
+    Path old = new Path(crawlDb, "old");
+    Path current = new Path(crawlDb, CURRENT_NAME);
+    if (fs.exists(current)) {
+      if (fs.exists(old))
+        fs.delete(old, true);
+      fs.rename(current, old);
+    }
+    fs.mkdirs(crawlDb);
+    fs.rename(newCrawlDb, current);
+    if (!preserveBackup && fs.exists(old))
+      fs.delete(old, true);
+    Path lock = new Path(crawlDb, LOCK_NAME);
+    LockUtil.removeLockFile(fs, lock);
+  }
+
+  public static void install(Job job, Path crawlDb) throws IOException {
+    Configuration conf = job.getConfiguration();
+    boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
+    FileSystem fs = FileSystem.get(conf);
+    Path old = new Path(crawlDb, "old");
+    Path current = new Path(crawlDb, CURRENT_NAME);
+    Path tempCrawlDb = org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+        .getOutputPath(job);
+    FSUtils.replace(fs, old, current, true);
+    FSUtils.replace(fs, current, tempCrawlDb, true);
+    Path lock = new Path(crawlDb, LOCK_NAME);
+    LockUtil.removeLockFile(fs, lock);
+    if (!preserveBackup && fs.exists(old)) {
+      fs.delete(old, true);
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 1) {
+      System.err
+          .println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
+      System.err.println("\tcrawldb\tCrawlDb to update");
+      System.err
+          .println("\t-dir segments\tparent directory containing all segments to update from");
+      System.err
+          .println("\tseg1 seg2 ...\tlist of segment names to update from");
+      System.err
+          .println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
+      System.err
+          .println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
+      System.err
+          .println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
+      System.err
+          .println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
+
+      return -1;
+    }
+    boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING,
+        false);
+    boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED,
+        true);
+    boolean force = false;
+    final FileSystem fs = FileSystem.get(getConf());
+    HashSet<Path> dirs = new HashSet<Path>();
+    for (int i = 1; i < args.length; i++) {
+      if (args[i].equals("-normalize")) {
+        normalize = true;
+      } else if (args[i].equals("-filter")) {
+        filter = true;
+      } else if (args[i].equals("-force")) {
+        force = true;
+      } else if (args[i].equals("-noAdditions")) {
+        additionsAllowed = false;
+      } else if (args[i].equals("-dir")) {
+        FileStatus[] paths = fs.listStatus(new Path(args[++i]),
+            HadoopFSUtil.getPassDirectoriesFilter(fs));
+        dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
+      } else {
+        dirs.add(new Path(args[i]));
+      }
+    }
+    try {
+      update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize,
+          filter, additionsAllowed, force);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+
+  /*
+   * Used for Nutch REST service
+   */
+  @Override
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+
+    boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING,
+        false);
+    boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED,
+        true);
+    boolean force = false;
+    HashSet<Path> dirs = new HashSet<Path>();
+
+    if (args.containsKey("normalize")) {
+      normalize = true;
+    } 
+    if (args.containsKey("filter")) {
+      filter = true;
+    } 
+    if (args.containsKey("force")) {
+      force = true;
+    } 
+    if (args.containsKey("noAdditions")) {
+      additionsAllowed = false;
+    }
+
+    Path crawlDb;
+    if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+      Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+      if(crawldbPath instanceof Path) {
+        crawlDb = (Path) crawldbPath;
+      }
+      else {
+        crawlDb = new Path(crawldbPath.toString());
+      }
+    }
+    else {
+      crawlDb = new Path(crawlId+"/crawldb");
+    }
+
+    Path segmentsDir;
+    final FileSystem fs = FileSystem.get(getConf());
+    if(args.containsKey(Nutch.ARG_SEGMENTDIR)) {
+      Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+      if(segDir instanceof Path) {
+        segmentsDir = (Path) segDir;
+      }
+      else {
+        segmentsDir = new Path(segDir.toString());
+      }
+      FileStatus[] paths = fs.listStatus(segmentsDir,
+          HadoopFSUtil.getPassDirectoriesFilter(fs));
+      dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
+    }
+
+    else if(args.containsKey(Nutch.ARG_SEGMENT)) {
+      Object segments = args.get(Nutch.ARG_SEGMENT);
+      ArrayList<String> segmentList = new ArrayList<String>();
+      if(segments instanceof ArrayList) {
+        segmentList = (ArrayList<String>)segments;
+      }
+      for(String segment: segmentList) {
+        dirs.add(new Path(segment));
+      }
+    }
+    else {
+      String segment_dir = crawlId+"/segments";
+      File dir = new File(segment_dir);
+      File[] segmentsList = dir.listFiles();  
+      Arrays.sort(segmentsList, new Comparator<File>(){
+        @Override
+        public int compare(File f1, File f2) {
+          if(f1.lastModified()>f2.lastModified())
+            return -1;
+          else
+            return 0;
+        }      
+      });
+      dirs.add(new Path(segmentsList[0].getPath()));
+    }
+    try {
+      update(crawlDb, dirs.toArray(new Path[dirs.size()]), normalize,
+          filter, additionsAllowed, force);
+      results.put(Nutch.VAL_RESULT, Integer.toString(0));
+      return results;
+    } catch (Exception e) {
+      LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
+      results.put(Nutch.VAL_RESULT, Integer.toString(-1));
+      return results;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbFilter.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbFilter.java
new file mode 100644
index 0000000..de4c37b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbFilter.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+
+/**
+ * Map-side pass that applies URL normalization and filtering to CrawlDb
+ * entries, keeping that logic separate from the rest of the CrawlDb
+ * manipulation code. Records whose URL is rejected (or purged as gone)
+ * are silently dropped; accepted records are re-emitted under the
+ * possibly rewritten URL.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class CrawlDbFilter implements
+    Mapper<Text, CrawlDatum, Text, CrawlDatum> {
+  /** Boolean job property: when true, run every URL through {@link URLFilters}. */
+  public static final String URL_FILTERING = "crawldb.url.filters";
+
+  /** Boolean job property: when true, rewrite every URL via {@link URLNormalizers}. */
+  public static final String URL_NORMALIZING = "crawldb.url.normalizers";
+
+  /** Job property naming the normalizer scope; defaults to the CrawlDb scope. */
+  public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
+
+  public static final Logger LOG = LoggerFactory.getLogger(CrawlDbFilter.class);
+
+  private boolean urlFiltering;
+
+  private boolean urlNormalizers;
+
+  private boolean url404Purging;
+
+  private URLFilters filters;
+
+  private URLNormalizers normalizers;
+
+  private String scope;
+
+  // Reused output key so we don't allocate a fresh Text per record.
+  private Text outKey = new Text();
+
+  public void configure(JobConf job) {
+    url404Purging = job.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
+    urlNormalizers = job.getBoolean(URL_NORMALIZING, false);
+    urlFiltering = job.getBoolean(URL_FILTERING, false);
+
+    if (urlNormalizers) {
+      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
+      normalizers = new URLNormalizers(job, scope);
+    }
+    if (urlFiltering) {
+      filters = new URLFilters(job);
+    }
+  }
+
+  public void close() {
+  }
+
+  public void map(Text key, CrawlDatum value,
+      OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+      throws IOException {
+
+    String url = key.toString();
+
+    // https://issues.apache.org/jira/browse/NUTCH-1101 check status first,
+    // cheaper than normalizing or filtering
+    if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
+      return; // purge gone pages outright
+    }
+    if (urlNormalizers) {
+      try {
+        url = normalizers.normalize(url, scope); // normalize the url
+      } catch (Exception e) {
+        LOG.warn("Skipping " + url + ":" + e);
+        url = null;
+      }
+    }
+    if (url != null && urlFiltering) {
+      try {
+        url = filters.filter(url); // filter the url
+      } catch (Exception e) {
+        LOG.warn("Skipping " + url + ":" + e);
+        url = null;
+      }
+    }
+    if (url != null) {
+      // passed all checks - emit under the (possibly rewritten) key
+      outKey.set(url);
+      output.collect(outKey, value);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbMerger.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbMerger.java
new file mode 100644
index 0000000..cd775d8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.Map.Entry;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+
+/**
+ * This tool merges several CrawlDb-s into one, optionally filtering URLs
+ * through the current URLFilters, to skip prohibited pages.
+ * 
+ * <p>
+ * It's possible to use this tool just for filtering - in that case only one
+ * CrawlDb should be specified in arguments.
+ * </p>
+ * <p>
+ * If more than one CrawlDb contains information about the same URL, only the
+ * most recent version is retained, as determined by the value of
+ * {@link org.apache.nutch.crawl.CrawlDatum#getFetchTime()}. However, all
+ * metadata information from all versions is accumulated, with newer values
+ * taking precedence over older values.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class CrawlDbMerger extends Configured implements Tool {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDbMerger.class);
+
+  /**
+   * Reducer that, for each URL, keeps the datum with the most recent
+   * last-fetch time (as computed by the configured {@link FetchSchedule})
+   * while accumulating metadata from every version; entries from newer
+   * versions override same-keyed entries from older ones.
+   */
+  public static class Merger extends MapReduceBase implements
+      Reducer<Text, CrawlDatum, Text, CrawlDatum> {
+    private org.apache.hadoop.io.MapWritable meta;
+    private CrawlDatum res = new CrawlDatum();
+    private FetchSchedule schedule;
+
+    public void close() throws IOException {
+    }
+
+    public void configure(JobConf conf) {
+      schedule = FetchScheduleFactory.getFetchSchedule(conf);
+    }
+
+    public void reduce(Text key, Iterator<CrawlDatum> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
+      long resTime = 0L;
+      boolean resSet = false;
+      // meta always holds the metadata accumulated so far, with the
+      // newest values winning on key collisions
+      meta = new org.apache.hadoop.io.MapWritable();
+      while (values.hasNext()) {
+        CrawlDatum val = values.next();
+        if (!resSet) {
+          // first value seen becomes the provisional result
+          res.set(val);
+          resSet = true;
+          resTime = schedule.calculateLastFetchTime(res);
+          for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
+            meta.put(e.getKey(), e.getValue());
+          }
+          continue;
+        }
+        // compute last fetch time, and pick the latest
+        long valTime = schedule.calculateLastFetchTime(val);
+        if (valTime > resTime) {
+          // collect all metadata, newer values override older values
+          for (Entry<Writable, Writable> e : val.getMetaData().entrySet()) {
+            meta.put(e.getKey(), e.getValue());
+          }
+          res.set(val);
+          resTime = valTime;
+        } else {
+          // insert older metadata before newer
+          for (Entry<Writable, Writable> e : meta.entrySet()) {
+            val.getMetaData().put(e.getKey(), e.getValue());
+          }
+          // val's map now contains old keys plus the overriding newer
+          // values; adopt it as the accumulated metadata
+          meta = val.getMetaData();
+        }
+      }
+      // attach the accumulated metadata to the winning datum and emit it
+      res.setMetaData(meta);
+      output.collect(key, res);
+    }
+  }
+
+  public CrawlDbMerger() {
+
+  }
+
+  public CrawlDbMerger(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Runs the merge job over the given CrawlDb directories and installs the
+   * merged result under {@code output}/{@link CrawlDb#CURRENT_NAME},
+   * replacing {@code output} if it already exists.
+   *
+   * @param output target CrawlDb directory
+   * @param dbs input CrawlDb directories to merge
+   * @param normalize whether to run URLNormalizers on the URLs
+   * @param filter whether to run URLFilters on the URLs
+   */
+  public void merge(Path output, Path[] dbs, boolean normalize, boolean filter)
+      throws Exception {
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("CrawlDb merge: starting at " + sdf.format(start));
+
+    JobConf job = createMergeJob(getConf(), output, normalize, filter);
+    for (int i = 0; i < dbs.length; i++) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Adding " + dbs[i]);
+      }
+      FileInputFormat.addInputPath(job, new Path(dbs[i], CrawlDb.CURRENT_NAME));
+    }
+    JobClient.runJob(job);
+    FileSystem fs = FileSystem.get(getConf());
+    if (fs.exists(output))
+      fs.delete(output, true);
+    fs.mkdirs(output);
+    // promote the job's temporary output to the final "current" directory
+    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output,
+        CrawlDb.CURRENT_NAME));
+    long end = System.currentTimeMillis();
+    LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  /**
+   * Builds the merge job: {@link CrawlDbFilter} as the mapper (optionally
+   * normalizing/filtering URLs) and {@link Merger} as the reducer, writing
+   * to a randomly-named temporary crawldb directory.
+   *
+   * @param conf base configuration
+   * @param output final output path (used only for the job name here)
+   * @param normalize enable URL normalization in the mapper
+   * @param filter enable URL filtering in the mapper
+   * @return the configured job
+   */
+  public static JobConf createMergeJob(Configuration conf, Path output,
+      boolean normalize, boolean filter) {
+    Path newCrawlDb = new Path("crawldb-merge-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    JobConf job = new NutchJob(conf);
+    job.setJobName("crawldb merge " + output);
+
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(CrawlDbFilter.class);
+    job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
+    job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
+    job.setReducerClass(Merger.class);
+
+    FileOutputFormat.setOutputPath(job, newCrawlDb);
+    job.setOutputFormat(MapFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+
+    return job;
+  }
+
+  /**
+   * Command-line entry point; delegates to {@link #run(String[])} via
+   * {@link ToolRunner}.
+   *
+   * @param args see the usage message in {@link #run(String[])}
+   */
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(),
+        args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err
+          .println("Usage: CrawlDbMerger <output_crawldb> <crawldb1> [<crawldb2> <crawldb3> ...] [-normalize] [-filter]");
+      System.err.println("\toutput_crawldb\toutput CrawlDb");
+      System.err
+          .println("\tcrawldb1 ...\tinput CrawlDb-s (single input CrawlDb is ok)");
+      System.err
+          .println("\t-normalize\tuse URLNormalizer on urls in the crawldb(s) (usually not needed)");
+      System.err.println("\t-filter\tuse URLFilters on urls in the crawldb(s)");
+      return -1;
+    }
+    Path output = new Path(args[0]);
+    ArrayList<Path> dbs = new ArrayList<Path>();
+    boolean filter = false;
+    boolean normalize = false;
+    FileSystem fs = FileSystem.get(getConf());
+    for (int i = 1; i < args.length; i++) {
+      if (args[i].equals("-filter")) {
+        filter = true;
+        continue;
+      } else if (args[i].equals("-normalize")) {
+        normalize = true;
+        continue;
+      }
+      // NOTE(review): non-existent input paths are silently skipped here
+      final Path dbPath = new Path(args[i]);
+      if (fs.exists(dbPath))
+        dbs.add(dbPath);
+    }
+    try {
+      merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("CrawlDb merge: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbReader.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbReader.java
new file mode 100644
index 0000000..5db5f95
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -0,0 +1,887 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.Closeable;
+import java.net.URL;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Random;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.TreeMap;
+
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.mapred.lib.IdentityMapper;
+import org.apache.hadoop.mapred.lib.IdentityReducer;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.JexlUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.lang.time.DateUtils;
+
+/**
+ * Read utility for the CrawlDB.
+ * 
+ * @author Andrzej Bialecki
+ * 
+ */
+public class CrawlDbReader extends Configured implements Closeable, Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(CrawlDbReader.class);
+
+  private MapFile.Reader[] readers = null;
+
+  private void openReaders(String crawlDb, JobConf config)
+      throws IOException {
+    if (readers != null)
+      return;
+    FileSystem fs = FileSystem.get(config);
+    readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb,
+        CrawlDb.CURRENT_NAME), config);
+  }
+
+  private void closeReaders() {
+    if (readers == null)
+      return;
+    for (int i = 0; i < readers.length; i++) {
+      try {
+        readers[i].close();
+      } catch (Exception e) {
+
+      }
+    }
+  }
+
+  public static class CrawlDatumCsvOutputFormat extends
+      FileOutputFormat<Text, CrawlDatum> {
+    protected static class LineRecordWriter implements
+        RecordWriter<Text, CrawlDatum> {
+      private DataOutputStream out;
+
+      public LineRecordWriter(DataOutputStream out) {
+        this.out = out;
+        try {
+          out.writeBytes("Url,Status code,Status name,Fetch Time,Modified Time,Retries since fetch,Retry interval seconds,Retry interval days,Score,Signature,Metadata\n");
+        } catch (IOException e) {
+        }
+      }
+
+      public synchronized void write(Text key, CrawlDatum value)
+          throws IOException {
+        out.writeByte('"');
+        out.writeBytes(key.toString());
+        out.writeByte('"');
+        out.writeByte(',');
+        out.writeBytes(Integer.toString(value.getStatus()));
+        out.writeByte(',');
+        out.writeByte('"');
+        out.writeBytes(CrawlDatum.getStatusName(value.getStatus()));
+        out.writeByte('"');
+        out.writeByte(',');
+        out.writeBytes(new Date(value.getFetchTime()).toString());
+        out.writeByte(',');
+        out.writeBytes(new Date(value.getModifiedTime()).toString());
+        out.writeByte(',');
+        out.writeBytes(Integer.toString(value.getRetriesSinceFetch()));
+        out.writeByte(',');
+        out.writeBytes(Float.toString(value.getFetchInterval()));
+        out.writeByte(',');
+        out.writeBytes(Float.toString((value.getFetchInterval() / FetchSchedule.SECONDS_PER_DAY)));
+        out.writeByte(',');
+        out.writeBytes(Float.toString(value.getScore()));
+        out.writeByte(',');
+        out.writeByte('"');
+        out.writeBytes(value.getSignature() != null ? StringUtil
+            .toHexString(value.getSignature()) : "null");
+        out.writeByte('"');
+        out.writeByte(',');
+        out.writeByte('"');
+        if (value.getMetaData() != null) {
+          for (Entry<Writable, Writable> e : value.getMetaData().entrySet()) {
+            out.writeBytes(e.getKey().toString());
+            out.writeByte(':');
+            out.writeBytes(e.getValue().toString());
+            out.writeBytes("|||");
+          }
+        }
+        out.writeByte('"');
+
+        out.writeByte('\n');
+      }
+
+      public synchronized void close(Reporter reporter) throws IOException {
+        out.close();
+      }
+    }
+
+    public RecordWriter<Text, CrawlDatum> getRecordWriter(FileSystem fs,
+        JobConf job, String name, Progressable progress) throws IOException {
+      Path dir = FileOutputFormat.getOutputPath(job);
+      DataOutputStream fileOut = fs.create(new Path(dir, name), progress);
+      return new LineRecordWriter(fileOut);
+    }
+  }
+
+  public static class CrawlDbStatMapper implements
+      Mapper<Text, CrawlDatum, Text, LongWritable> {
+    LongWritable COUNT_1 = new LongWritable(1);
+    private boolean sort = false;
+
+    public void configure(JobConf job) {
+      sort = job.getBoolean("db.reader.stats.sort", false);
+    }
+
+    public void close() {
+    }
+
+    public void map(Text key, CrawlDatum value,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+      output.collect(new Text("T"), COUNT_1);
+      output.collect(new Text("status " + value.getStatus()), COUNT_1);
+      output
+          .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
+      output.collect(new Text("sc"), new LongWritable(
+          (long) (value.getScore() * 1000.0)));
+      // fetch time (in minutes to prevent from overflows when summing up)
+      output.collect(new Text("ft"),
+          new LongWritable(value.getFetchTime() / (1000 * 60)));
+      // fetch interval (in seconds)
+      output.collect(new Text("fi"),
+          new LongWritable(value.getFetchInterval()));
+      if (sort) {
+        URL u = new URL(key.toString());
+        String host = u.getHost();
+        output.collect(new Text("status " + value.getStatus() + " " + host),
+            COUNT_1);
+      }
+    }
+  }
+
+  public static class CrawlDbStatCombiner implements
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    LongWritable val = new LongWritable();
+
+    public CrawlDbStatCombiner() {
+    }
+
+    public void configure(JobConf job) {
+    }
+
+    public void close() {
+    }
+
+    private void reduceMinMaxTotal(String keyPrefix, Iterator<LongWritable> values,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+      long total = 0;
+      long min = Long.MAX_VALUE;
+      long max = Long.MIN_VALUE;
+      while (values.hasNext()) {
+        LongWritable cnt = values.next();
+        if (cnt.get() < min)
+          min = cnt.get();
+        if (cnt.get() > max)
+          max = cnt.get();
+        total += cnt.get();
+      }
+      output.collect(new Text(keyPrefix+"n"), new LongWritable(min));
+      output.collect(new Text(keyPrefix+"x"), new LongWritable(max));
+      output.collect(new Text(keyPrefix+"t"), new LongWritable(total));
+    }
+    
+    public void reduce(Text key, Iterator<LongWritable> values,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+      val.set(0L);
+      String k = key.toString();
+      if (k.equals("sc") || k.equals("ft") || k.equals("fi")) {
+        reduceMinMaxTotal(k, values, output, reporter);
+      } else {
+        while (values.hasNext()) {
+          LongWritable cnt = values.next();
+          val.set(val.get() + cnt.get());
+        }
+        output.collect(key, val);
+      }
+    }
+  }
+
+  public static class CrawlDbStatReducer implements
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    public void configure(JobConf job) {
+    }
+
+    public void close() {
+    }
+
+    public void reduce(Text key, Iterator<LongWritable> values,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+
+      String k = key.toString();
+      if (k.equals("T")) {
+        // sum all values for this key
+        long sum = 0;
+        while (values.hasNext()) {
+          sum += values.next().get();
+        }
+        // output sum
+        output.collect(key, new LongWritable(sum));
+      } else if (k.startsWith("status") || k.startsWith("retry")) {
+        LongWritable cnt = new LongWritable();
+        while (values.hasNext()) {
+          LongWritable val = values.next();
+          cnt.set(cnt.get() + val.get());
+        }
+        output.collect(key, cnt);
+      } else if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
+        LongWritable cnt = new LongWritable(Long.MIN_VALUE);
+        while (values.hasNext()) {
+          LongWritable val = values.next();
+          if (cnt.get() < val.get())
+            cnt.set(val.get());
+        }
+        output.collect(key, cnt);
+      } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
+        LongWritable cnt = new LongWritable(Long.MAX_VALUE);
+        while (values.hasNext()) {
+          LongWritable val = values.next();
+          if (cnt.get() > val.get())
+            cnt.set(val.get());
+        }
+        output.collect(key, cnt);
+      } else if (k.equals("sct") || k.equals("ftt") || k.equals("fit")) {
+        LongWritable cnt = new LongWritable();
+        while (values.hasNext()) {
+          LongWritable val = values.next();
+          cnt.set(cnt.get() + val.get());
+        }
+        output.collect(key, cnt);
+      }
+    }
+  }
+
+  public static class CrawlDbTopNMapper implements
+      Mapper<Text, CrawlDatum, FloatWritable, Text> {
+    private static final FloatWritable fw = new FloatWritable();
+    private float min = 0.0f;
+
+    public void configure(JobConf job) {
+      min = job.getFloat("db.reader.topn.min", 0.0f);
+    }
+
+    public void close() {
+    }
+
+    public void map(Text key, CrawlDatum value,
+        OutputCollector<FloatWritable, Text> output, Reporter reporter)
+        throws IOException {
+      if (value.getScore() < min)
+        return; // don't collect low-scoring records
+      fw.set(-value.getScore()); // reverse sorting order
+      output.collect(fw, key); // invert mapping: score -> url
+    }
+  }
+
+  public static class CrawlDbTopNReducer implements
+      Reducer<FloatWritable, Text, FloatWritable, Text> {
+    private long topN;
+    private long count = 0L;
+
+    public void reduce(FloatWritable key, Iterator<Text> values,
+        OutputCollector<FloatWritable, Text> output, Reporter reporter)
+        throws IOException {
+      while (values.hasNext() && count < topN) {
+        key.set(-key.get());
+        output.collect(key, values.next());
+        count++;
+      }
+    }
+
+    public void configure(JobConf job) {
+      topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
+    }
+
+    public void close() {
+    }
+  }
+
+  public void close() {
+    closeReaders();
+  }
+
+  private TreeMap<String, LongWritable> processStatJobHelper(String crawlDb, Configuration config, boolean sort) throws IOException{
+	  Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
+
+	  JobConf job = new NutchJob(config);
+	  job.setJobName("stats " + crawlDb);
+	  job.setBoolean("db.reader.stats.sort", sort);
+
+	  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+	  job.setInputFormat(SequenceFileInputFormat.class);
+
+	  job.setMapperClass(CrawlDbStatMapper.class);
+	  job.setCombinerClass(CrawlDbStatCombiner.class);
+	  job.setReducerClass(CrawlDbStatReducer.class);
+
+	  FileOutputFormat.setOutputPath(job, tmpFolder);
+	  job.setOutputFormat(SequenceFileOutputFormat.class);
+	  job.setOutputKeyClass(Text.class);
+	  job.setOutputValueClass(LongWritable.class);
+
+	  // https://issues.apache.org/jira/browse/NUTCH-1029
+	  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+	  JobClient.runJob(job);
+
+	  // reading the result
+	  FileSystem fileSystem = FileSystem.get(config);
+	  SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config,
+			  tmpFolder);
+
+	  Text key = new Text();
+	  LongWritable value = new LongWritable();
+
+	  TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
+	  for (int i = 0; i < readers.length; i++) {
+		  SequenceFile.Reader reader = readers[i];
+		  while (reader.next(key, value)) {
+			  String k = key.toString();
+			  LongWritable val = stats.get(k);
+			  if (val == null) {
+				  val = new LongWritable();
+				  if (k.equals("scx") || k.equals("ftx") || k.equals("fix"))
+					  val.set(Long.MIN_VALUE);
+				  if (k.equals("scn") || k.equals("ftn") || k.equals("fin"))
+					  val.set(Long.MAX_VALUE);
+				  stats.put(k, val);
+			  }
+			  if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
+				  if (val.get() < value.get())
+					  val.set(value.get());
+			  } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
+				  if (val.get() > value.get())
+					  val.set(value.get());
+			  } else {
+				  val.set(val.get() + value.get());
+			  }
+		  }
+		  reader.close();
+	  }
+	  // removing the tmp folder
+	  fileSystem.delete(tmpFolder, true);
+	  return stats;
+  }
+  
+  public void processStatJob(String crawlDb, Configuration config, boolean sort)
+      throws IOException {
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb statistics start: " + crawlDb);
+    }
+    TreeMap<String, LongWritable> stats = processStatJobHelper(crawlDb, config, sort);
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("Statistics for CrawlDb: " + crawlDb);
+      LongWritable totalCnt = stats.get("T");
+      stats.remove("T");
+      LOG.info("TOTAL urls:\t" + totalCnt.get());
+      for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
+        String k = entry.getKey();
+        LongWritable val = entry.getValue();
+        if (k.equals("scn")) {
+          LOG.info("min score:\t" + (val.get() / 1000.0f));
+        } else if (k.equals("scx")) {
+          LOG.info("max score:\t" + (val.get() / 1000.0f));
+        } else if (k.equals("sct")) {
+          LOG.info("avg score:\t"
+              + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
+        } else if (k.equals("ftn")) {
+          LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * val.get()));
+        } else if (k.equals("ftx")) {
+          LOG.info("latest fetch time:\t" + new Date(1000 * 60 * val.get()));
+        } else if (k.equals("ftt")) {
+          LOG.info("avg of fetch times:\t"
+              + new Date(1000 * 60 * (val.get() / totalCnt.get())));
+        } else if (k.equals("fin")) {
+          LOG.info("shortest fetch interval:\t{}",
+              TimingUtil.secondsToDaysHMS(val.get()));
+        } else if (k.equals("fix")) {
+          LOG.info("longest fetch interval:\t{}",
+              TimingUtil.secondsToDaysHMS(val.get()));
+        } else if (k.equals("fit")) {
+          LOG.info("avg fetch interval:\t{}",
+              TimingUtil.secondsToDaysHMS(val.get() / totalCnt.get()));
+        } else if (k.startsWith("status")) {
+          String[] st = k.split(" ");
+          int code = Integer.parseInt(st[1]);
+          if (st.length > 2)
+            LOG.info("   " + st[2] + " :\t" + val);
+          else
+            LOG.info(st[0] + " " + code + " ("
+                + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
+        } else
+          LOG.info(k + ":\t" + val);
+      }
+    }
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb statistics: done");
+    }
+
+  }
+
+  public CrawlDatum get(String crawlDb, String url, JobConf config)
+      throws IOException {
+    Text key = new Text(url);
+    CrawlDatum val = new CrawlDatum();
+    openReaders(crawlDb, config);
+    CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers,
+        new HashPartitioner<Text, CrawlDatum>(), key, val);
+    return res;
+  }
+
+  public void readUrl(String crawlDb, String url, JobConf config)
+      throws IOException {
+    CrawlDatum res = get(crawlDb, url, config);
+    System.out.println("URL: " + url);
+    if (res != null) {
+      System.out.println(res);
+    } else {
+      System.out.println("not found");
+    }
+  }
+
+  public void processDumpJob(String crawlDb, String output,
+      JobConf config, String format, String regex, String status,
+      Integer retry, String expr) throws IOException {
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb dump: starting");
+      LOG.info("CrawlDb db: " + crawlDb);
+    }
+
+    Path outFolder = new Path(output);
+
+    JobConf job = new NutchJob(config);
+    job.setJobName("dump " + crawlDb);
+
+    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+    job.setInputFormat(SequenceFileInputFormat.class);
+    FileOutputFormat.setOutputPath(job, outFolder);
+
+    if (format.equals("csv")) {
+      job.setOutputFormat(CrawlDatumCsvOutputFormat.class);
+    } else if (format.equals("crawldb")) {
+      job.setOutputFormat(MapFileOutputFormat.class);
+    } else {
+      job.setOutputFormat(TextOutputFormat.class);
+    }
+
+    if (status != null)
+      job.set("status", status);
+    if (regex != null)
+      job.set("regex", regex);
+    if (retry != null)
+      job.setInt("retry", retry);
+    if (expr != null) {
+      job.set("expr", expr);
+      LOG.info("CrawlDb db: expr: " + expr);
+    }
+
+    job.setMapperClass(CrawlDbDumpMapper.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+
+    JobClient.runJob(job);
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb dump: done");
+    }
+  }
+
+  public static class CrawlDbDumpMapper implements
+      Mapper<Text, CrawlDatum, Text, CrawlDatum> {
+    Pattern pattern = null;
+    Matcher matcher = null;
+    String status = null;
+    Integer retry = null;
+    Expression expr = null;
+
+    public void configure(JobConf job) {
+      if (job.get("regex", null) != null) {
+        pattern = Pattern.compile(job.get("regex"));
+      }
+      status = job.get("status", null);
+      retry = job.getInt("retry", -1);
+      
+      if (job.get("expr", null) != null) {
+        expr = JexlUtil.parseExpression(job.get("expr", null));
+      }
+    }
+
+    public void close() {
+    }
+
+    public void map(Text key, CrawlDatum value,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
+
+      // check retry
+      if (retry != -1) {
+        if (value.getRetriesSinceFetch() < retry) {
+          return;
+        }
+      }
+
+      // check status
+      if (status != null
+          && !status.equalsIgnoreCase(CrawlDatum.getStatusName(value
+              .getStatus())))
+        return;
+
+      // check regex
+      if (pattern != null) {
+        matcher = pattern.matcher(key.toString());
+        if (!matcher.matches()) {
+          return;
+        }
+      }
+      
+      // check expr
+      if (expr != null) {
+        if (!value.evaluate(expr)) {
+          return;
+        }
+      }
+
+      output.collect(key, value);
+    }
+  }
+
+  public void processTopNJob(String crawlDb, long topN, float min,
+      String output, JobConf config) throws IOException {
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
+      LOG.info("CrawlDb db: " + crawlDb);
+    }
+
+    Path outFolder = new Path(output);
+    Path tempDir = new Path(config.get("mapred.temp.dir", ".")
+        + "/readdb-topN-temp-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    JobConf job = new NutchJob(config);
+    job.setJobName("topN prepare " + crawlDb);
+    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setMapperClass(CrawlDbTopNMapper.class);
+    job.setReducerClass(IdentityReducer.class);
+
+    FileOutputFormat.setOutputPath(job, tempDir);
+    job.setOutputFormat(SequenceFileOutputFormat.class);
+    job.setOutputKeyClass(FloatWritable.class);
+    job.setOutputValueClass(Text.class);
+
+    job.setFloat("db.reader.topn.min", min);
+    JobClient.runJob(job);
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb topN: collecting topN scores.");
+    }
+    job = new NutchJob(config);
+    job.setJobName("topN collect " + crawlDb);
+    job.setLong("db.reader.topn", topN);
+
+    FileInputFormat.addInputPath(job, tempDir);
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setMapperClass(IdentityMapper.class);
+    job.setReducerClass(CrawlDbTopNReducer.class);
+
+    FileOutputFormat.setOutputPath(job, outFolder);
+    job.setOutputFormat(TextOutputFormat.class);
+    job.setOutputKeyClass(FloatWritable.class);
+    job.setOutputValueClass(Text.class);
+
+    job.setNumReduceTasks(1); // create a single file.
+
+    JobClient.runJob(job);
+    FileSystem fs = FileSystem.get(config);
+    fs.delete(tempDir, true);
+    if (LOG.isInfoEnabled()) {
+      LOG.info("CrawlDb topN: done");
+    }
+
+  }
+
  /**
   * Command-line driver: parses arguments and dispatches to the -stats,
   * -dump, -url or -topN operations.
   *
   * @param args first argument is the crawldb directory, followed by one or
   *          more operations; see the usage message below
   * @return 0 on success, -1 on a usage error
   * @throws IOException if any of the underlying jobs fail
   */
  public int run(String[] args) throws IOException {
    @SuppressWarnings("resource")
    CrawlDbReader dbr = new CrawlDbReader();

    if (args.length < 2) {
      System.err
          .println("Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)");
      System.err
          .println("\t<crawldb>\tdirectory name where crawldb is located");
      System.err
          .println("\t-stats [-sort] \tprint overall statistics to System.out");
      System.err.println("\t\t[-sort]\tlist status sorted by host");
      System.err
          .println("\t-dump <out_dir> [-format normal|csv|crawldb]\tdump the whole db to a text file in <out_dir>");
      System.err.println("\t\t[-format csv]\tdump in Csv format");
      System.err
          .println("\t\t[-format normal]\tdump in standard format (default option)");
      System.err.println("\t\t[-format crawldb]\tdump as CrawlDB");
      System.err.println("\t\t[-regex <expr>]\tfilter records with expression");
      System.err.println("\t\t[-retry <num>]\tminimum retry count");
      System.err
          .println("\t\t[-status <status>]\tfilter records by CrawlDatum status");
      System.err.println("\t\t[-expr <expr>]\tJexl expression to evaluate for this record");
      System.err
          .println("\t-url <url>\tprint information on <url> to System.out");
      System.err
          .println("\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
      System.err
          .println("\t\t[<min>]\tskip records with scores below this value.");
      System.err.println("\t\t\tThis can significantly improve performance.");
      return -1;
    }
    String param = null;
    String crawlDb = args[0];
    JobConf job = new NutchJob(getConf());
    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-stats")) {
        boolean toSort = false;
        if (i < args.length - 1 && "-sort".equals(args[i + 1])) {
          toSort = true;
          i++;
        }
        dbr.processStatJob(crawlDb, job, toSort);
      } else if (args[i].equals("-dump")) {
        param = args[++i];
        // Defaults for the optional -dump sub-options.
        String format = "normal";
        String regex = null;
        Integer retry = null;
        String status = null;
        String expr = null;
        // Scan ahead with j for sub-options; i is advanced by 2 for each
        // consumed option/value pair so the outer loop skips them.
        // NOTE(review): this assumes each sub-option appears at most once
        // directly after -dump; repeated options would over-advance i —
        // TODO confirm intended usage.
        for (int j = i + 1; j < args.length; j++) {
          if (args[j].equals("-format")) {
            format = args[++j];
            i = i + 2;
          }
          if (args[j].equals("-regex")) {
            regex = args[++j];
            i = i + 2;
          }
          if (args[j].equals("-retry")) {
            retry = Integer.parseInt(args[++j]);
            i = i + 2;
          }
          if (args[j].equals("-status")) {
            status = args[++j];
            i = i + 2;
          }
          if (args[j].equals("-expr")) {
            expr = args[++j];
            i=i+2;
          }
        }
        dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry, expr);
      } else if (args[i].equals("-url")) {
        param = args[++i];
        dbr.readUrl(crawlDb, param, job);
      } else if (args[i].equals("-topN")) {
        param = args[++i];
        long topN = Long.parseLong(param);
        param = args[++i];
        float min = 0.0f;
        // Optional third argument: minimum score threshold.
        if (i < args.length - 1) {
          min = Float.parseFloat(args[++i]);
        }
        dbr.processTopNJob(crawlDb, topN, min, param, job);
      } else {
        System.err.println("\nError: wrong argument " + args[i]);
        return -1;
      }
    }
    return 0;
  }
+  
+  public static void main(String[] args) throws Exception {
+    int result = ToolRunner.run(NutchConfiguration.create(),
+        new CrawlDbReader(), args);
+    System.exit(result);
+  }
+
  /**
   * REST-style query entry point: runs one of the "stats", "dump", "topN" or
   * "url" operations against {@code <crawlId>/crawldb} and returns either a
   * map of results or a File pointing at the produced dump.
   *
   * @param args operation-specific string parameters (e.g. "out_dir", "sort")
   * @param conf configuration used for the underlying jobs
   * @param type one of "stats", "dump", "topN", "url" (case-insensitive)
   * @param crawlId crawl directory; the crawldb is expected at
   *          {@code crawlId + "/crawldb"}
   * @return a result map, or a File for "dump"/"topN"; an empty map for an
   *         unrecognized type
   * @throws Exception if the underlying job fails
   */
  public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception {
 

    Map<String, Object> results = new HashMap<String, Object>();
    String crawlDb = crawlId + "/crawldb";

    if(type.equalsIgnoreCase("stats")){
      boolean sort = false;
      if(args.containsKey("sort")){
        if(args.get("sort").equalsIgnoreCase("true"))
          sort = true;
      }
      TreeMap<String , LongWritable> stats = processStatJobHelper(crawlDb, NutchConfiguration.create(), sort);
      // "T" holds the total URL count; removed so the loop below only sees
      // the per-statistic entries.
      LongWritable totalCnt = stats.get("T");
      stats.remove("T");
      results.put("totalUrls", String.valueOf(totalCnt.get()));
      Map<String, Object> statusMap = new HashMap<String, Object>();      

      // Keys: "scn"/"scx"/"sct" are min/max/total score (scaled by 1000),
      // "status <code>" is a per-status count and "status <code> <host>" a
      // per-host breakdown when sorting was requested.
      for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
        String k = entry.getKey();
        LongWritable val = entry.getValue();
        if (k.equals("scn")) {
          results.put("minScore", String.valueOf((val.get() / 1000.0f)));
        } else if (k.equals("scx")) {
          results.put("maxScore", String.valueOf((val.get() / 1000.0f)));
        } else if (k.equals("sct")) {
          results.put("avgScore", String.valueOf((float) ((((double) val.get()) / totalCnt.get()) / 1000.0)));
        } else if (k.startsWith("status")) {
          String[] st = k.split(" ");
          int code = Integer.parseInt(st[1]);
          if (st.length > 2){
            // Per-host entry: attach it to the status record created earlier.
            // NOTE(review): assumes the base "status <code>" key sorts before
            // its "status <code> <host>" keys in the TreeMap, otherwise
            // individualStatusInfo would be null here — TODO confirm.
            @SuppressWarnings("unchecked")
            Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap.get(String.valueOf(code));
            Map<String, String> hostValues;
            if(individualStatusInfo.containsKey("hostValues")){
              hostValues= (Map<String, String>) individualStatusInfo.get("hostValues");
            }
            else{
              hostValues = new HashMap<String, String>();
              individualStatusInfo.put("hostValues", hostValues);
            }
            hostValues.put(st[2], String.valueOf(val));
          }
          else{
            Map<String, Object> individualStatusInfo = new HashMap<String, Object>();

            individualStatusInfo.put("statusValue", CrawlDatum.getStatusName((byte) code));
            individualStatusInfo.put("count", String.valueOf(val));

            statusMap.put(String.valueOf(code), individualStatusInfo);
          }
        } else
          results.put(k, String.valueOf(val));
      }
      results.put("status", statusMap);
      return results;
    }
    if(type.equalsIgnoreCase("dump")){
      String output = args.get("out_dir");
      String format = "normal";
      String regex = null;
      Integer retry = null;
      String status = null;
      String expr = null;
      if (args.containsKey("format")) {
        format = args.get("format");
      }
      if (args.containsKey("regex")) {
        regex = args.get("regex");
      }
      if (args.containsKey("retry")) {
        retry = Integer.parseInt(args.get("retry"));
      }
      if (args.containsKey("status")) {
        status = args.get("status");
      }
      if (args.containsKey("expr")) {
        expr = args.get("expr");
      }
      processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry, expr);
      // NOTE(review): assumes a single reduce task so the dump lands in
      // part-00000 — verify against the job configuration.
      File dumpFile = new File(output+"/part-00000");
      return dumpFile;
    }
    if (type.equalsIgnoreCase("topN")) {
      String output = args.get("out_dir");
      long topN = Long.parseLong(args.get("nnn"));
      float min = 0.0f;
      if(args.containsKey("min")){
        min = Float.parseFloat(args.get("min"));
      }
      processTopNJob(crawlDb, topN, min, output, new NutchJob(conf));
      File dumpFile = new File(output+"/part-00000");
      return dumpFile;
    }

    if(type.equalsIgnoreCase("url")){
      String url = args.get("url");
      CrawlDatum res = get(crawlDb, url, new NutchJob(conf));
      results.put("status", res.getStatus());
      results.put("fetchTime", new Date(res.getFetchTime()));
      results.put("modifiedTime", new Date(res.getModifiedTime()));
      results.put("retriesSinceFetch", res.getRetriesSinceFetch());
      results.put("retryInterval", res.getFetchInterval());
      results.put("score", res.getScore());
      results.put("signature", StringUtil.toHexString(res.getSignature()));
      Map<String, String> metadata = new HashMap<String, String>();
      if(res.getMetaData()!=null){
        for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
          metadata.put(String.valueOf(e.getKey()), String.valueOf(e.getValue()));
        }
      }
      results.put("metadata", metadata);

      return results;
    }
    return results;
  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbReducer.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbReducer.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbReducer.java
new file mode 100644
index 0000000..1ae73b8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -0,0 +1,339 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+import java.io.IOException;
+
+// Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.PriorityQueue;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+
/**
 * Merge new page entries with existing entries.
 *
 * For each URL the reducer collects the existing db entry (if any), the
 * latest fetch result, inlink contributions, a content signature and parse
 * metadata, then computes the merged CrawlDatum that is written back to the
 * CrawlDb.
 */
public class CrawlDbReducer implements
    Reducer<Text, CrawlDatum, Text, CrawlDatum> {
  public static final Logger LOG = LoggerFactory
      .getLogger(CrawlDbReducer.class);

  // Maximum fetch retries before a page is marked DB_GONE.
  private int retryMax;
  // Reused output datum to avoid per-record allocation.
  private CrawlDatum result = new CrawlDatum();
  // Bounded queue collecting inlink datums (capped by db.update.max.inlinks).
  private InlinkPriorityQueue linked = null;
  private ScoringFilters scfilters = null;
  // Whether URLs not already in the db may be added.
  private boolean additionsAllowed;
  // System-wide maximum fetch interval; larger intervals force a refetch.
  private int maxInterval;
  private FetchSchedule schedule;

  public void configure(JobConf job) {
    retryMax = job.getInt("db.fetch.retry.max", 3);
    scfilters = new ScoringFilters(job);
    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
    maxInterval = job.getInt("db.fetch.interval.max", 0);
    schedule = FetchScheduleFactory.getFetchSchedule(job);
    int maxLinks = job.getInt("db.update.max.inlinks", 10000);
    linked = new InlinkPriorityQueue(maxLinks);
  }

  public void close() {
  }

  public void reduce(Text key, Iterator<CrawlDatum> values,
      OutputCollector<Text, CrawlDatum> output, Reporter reporter)
      throws IOException {

    CrawlDatum fetch = new CrawlDatum();
    CrawlDatum old = new CrawlDatum();

    boolean fetchSet = false;
    boolean oldSet = false;
    byte[] signature = null;
    boolean multiple = false; // avoid deep copy when only single value exists
    linked.clear();
    org.apache.hadoop.io.MapWritable metaFromParse = null;

    // Classify each incoming datum: existing db entry, fetch result, inlink,
    // signature, or parse metadata.
    while (values.hasNext()) {
      CrawlDatum datum = values.next();
      if (!multiple && values.hasNext())
        multiple = true;
      if (CrawlDatum.hasDbStatus(datum)) {
        if (!oldSet) {
          if (multiple) {
            old.set(datum);
          } else {
            // no need for a deep copy - this is the only value
            old = datum;
          }
          oldSet = true;
        } else {
          // always take the latest version
          if (old.getFetchTime() < datum.getFetchTime())
            old.set(datum);
        }
        continue;
      }

      if (CrawlDatum.hasFetchStatus(datum)) {
        if (!fetchSet) {
          if (multiple) {
            fetch.set(datum);
          } else {
            fetch = datum;
          }
          fetchSet = true;
        } else {
          // always take the latest version
          if (fetch.getFetchTime() < datum.getFetchTime())
            fetch.set(datum);
        }
        continue;
      }

      switch (datum.getStatus()) { // collect other info
      case CrawlDatum.STATUS_LINKED:
        CrawlDatum link;
        if (multiple) {
          link = new CrawlDatum();
          link.set(datum);
        } else {
          link = datum;
        }
        linked.insert(link);
        break;
      case CrawlDatum.STATUS_SIGNATURE:
        signature = datum.getSignature();
        break;
      case CrawlDatum.STATUS_PARSE_META:
        metaFromParse = datum.getMetaData();
        break;
      default:
        LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
      }
    }

    // copy the content of the queue into a List
    // in reversed order
    int numLinks = linked.size();
    List<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
    for (int i = numLinks - 1; i >= 0; i--) {
      linkList.add(linked.pop());
    }

    // if it doesn't already exist, skip it
    if (!oldSet && !additionsAllowed)
      return;

    // if there is no fetched datum, perhaps there is a link
    if (!fetchSet && linkList.size() > 0) {
      fetch = linkList.get(0);
      fetchSet = true;
    }

    // still no new data - record only unchanged old data, if exists, and return
    if (!fetchSet) {
      if (oldSet) {// at this point at least "old" should be present
        output.collect(key, old);
        reporter.getCounter("CrawlDB status",
            CrawlDatum.getStatusName(old.getStatus())).increment(1);
      } else {
        LOG.warn("Missing fetch and old value, signature=" + signature);
      }
      return;
    }

    if (signature == null)
      signature = fetch.getSignature();
    long prevModifiedTime = oldSet ? old.getModifiedTime() : 0L;
    long prevFetchTime = oldSet ? old.getFetchTime() : 0L;

    // initialize with the latest version, be it fetch or link
    result.set(fetch);
    if (oldSet) {
      // copy metadata from old, if exists
      if (old.getMetaData().size() > 0) {
        result.putAllMetaData(old);
        // overlay with new, if any
        if (fetch.getMetaData().size() > 0)
          result.putAllMetaData(fetch);
      }
      // set the most recent valid value of modifiedTime
      if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
        result.setModifiedTime(old.getModifiedTime());
      }
    }

    switch (fetch.getStatus()) { // determine new status

    case CrawlDatum.STATUS_LINKED: // it was link
      if (oldSet) { // if old exists
        result.set(old); // use it
      } else {
        // New URL discovered via a link: schedule it and give it an initial
        // score through the scoring filters.
        result = schedule.initializeSchedule(key, result);
        result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
        try {
          scfilters.initialScore(key, result);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Cannot filter init score for url " + key
                + ", using default: " + e.getMessage());
          }
          result.setScore(0.0f);
        }
      }
      break;

    case CrawlDatum.STATUS_FETCH_SUCCESS: // successful fetch
    case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
    case CrawlDatum.STATUS_FETCH_REDIR_PERM:
    case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
      // https://issues.apache.org/jira/browse/NUTCH-1656
      if (metaFromParse != null) {
        for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
          result.getMetaData().put(e.getKey(), e.getValue());
        }
      }
      
      // determine the modification status
      int modified = FetchSchedule.STATUS_UNKNOWN;
      if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
        modified = FetchSchedule.STATUS_NOTMODIFIED;
      } else if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
        // only successful fetches (but not redirects, NUTCH-1422)
        // are detected as "not modified" by signature comparison
        if (oldSet && old.getSignature() != null && signature != null) {
          if (SignatureComparator._compare(old.getSignature(), signature) != 0) {
            modified = FetchSchedule.STATUS_MODIFIED;
          } else {
            modified = FetchSchedule.STATUS_NOTMODIFIED;
          }
        }
      }
      // set the schedule
      result = schedule.setFetchSchedule(key, result, prevFetchTime,
          prevModifiedTime, fetch.getFetchTime(), fetch.getModifiedTime(),
          modified);
      // set the result status and signature
      if (modified == FetchSchedule.STATUS_NOTMODIFIED) {
        result.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);

        // NUTCH-1341 The page is not modified according to its signature, let's
        // reset lastModified as well
        result.setModifiedTime(prevModifiedTime);

        if (oldSet)
          result.setSignature(old.getSignature());
      } else {
        switch (fetch.getStatus()) {
        case CrawlDatum.STATUS_FETCH_SUCCESS:
          result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
          break;
        case CrawlDatum.STATUS_FETCH_REDIR_PERM:
          result.setStatus(CrawlDatum.STATUS_DB_REDIR_PERM);
          break;
        case CrawlDatum.STATUS_FETCH_REDIR_TEMP:
          result.setStatus(CrawlDatum.STATUS_DB_REDIR_TEMP);
          break;
        default:
          LOG.warn("Unexpected status: " + fetch.getStatus()
              + " resetting to old status.");
          if (oldSet)
            result.setStatus(old.getStatus());
          else
            result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
        }
        result.setSignature(signature);
      }

      // if fetchInterval is larger than the system-wide maximum, trigger
      // an unconditional recrawl. This prevents the page to be stuck at
      // NOTMODIFIED state, when the old fetched copy was already removed with
      // old segments.
      if (maxInterval < result.getFetchInterval())
        result = schedule.forceRefetch(key, result, false);
      break;
    case CrawlDatum.STATUS_SIGNATURE:
      // A signature without a matching fetch result is meaningless here.
      if (LOG.isWarnEnabled()) {
        LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
      }
      return;
    case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
      if (oldSet) {
        result.setSignature(old.getSignature()); // use old signature
      }
      result = schedule.setPageRetrySchedule(key, result, prevFetchTime,
          prevModifiedTime, fetch.getFetchTime());
      // Retry until retryMax is reached, then give the page up as gone.
      if (result.getRetriesSinceFetch() < retryMax) {
        result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
      } else {
        result.setStatus(CrawlDatum.STATUS_DB_GONE);
        result = schedule.setPageGoneSchedule(key, result, prevFetchTime,
            prevModifiedTime, fetch.getFetchTime());
      }
      break;

    case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
      if (oldSet)
        result.setSignature(old.getSignature()); // use old signature
      result.setStatus(CrawlDatum.STATUS_DB_GONE);
      result = schedule.setPageGoneSchedule(key, result, prevFetchTime,
          prevModifiedTime, fetch.getFetchTime());
      break;

    default:
      throw new RuntimeException("Unknown status: " + fetch.getStatus() + " "
          + key);
    }

    // Let scoring filters adjust the db score based on the inlinks.
    try {
      scfilters.updateDbScore(key, oldSet ? old : null, result, linkList);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't update score, key=" + key + ": " + e);
      }
    }
    // remove generation time, if any
    result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
    output.collect(key, result);
    reporter.getCounter("CrawlDB status",
        CrawlDatum.getStatusName(result.getStatus())).increment(1);
  }

}
+
+class InlinkPriorityQueue extends PriorityQueue<CrawlDatum> {
+
+  public InlinkPriorityQueue(int maxSize) {
+    initialize(maxSize);
+  }
+
+  /** Determines the ordering of objects in this priority queue. **/
+  protected boolean lessThan(Object arg0, Object arg1) {
+    CrawlDatum candidate = (CrawlDatum) arg0;
+    CrawlDatum least = (CrawlDatum) arg1;
+    return candidate.getScore() > least.getScore();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/DeduplicationJob.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/DeduplicationJob.java b/nutch-core/src/main/java/org/apache/nutch/crawl/DeduplicationJob.java
new file mode 100644
index 0000000..c439570
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -0,0 +1,389 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.text.SimpleDateFormat;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Random;
+import java.util.Arrays;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.Counters.Group;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Generic deduplicator which groups fetched URLs with the same digest and marks
+ * all of them as duplicate except the one with the highest score (based on the
+ * score in the crawldb, which is not necessarily the same as the score
+ * indexed). If two (or more) documents have the same score, then the document
+ * with the latest timestamp is kept. If the documents have the same timestamp
+ * then the one with the shortest URL is kept. The documents marked as duplicate
+ * can then be deleted with the command CleaningJob.
+ ***/
+public class DeduplicationJob extends NutchTool implements Tool {
+
  public static final Logger LOG = LoggerFactory
      .getLogger(DeduplicationJob.class);

  // Temporary metadata key used to carry each record's URL through the
  // shuffle; removed again before the final output is written.
  private final static Text urlKey = new Text("_URLTEMPKEY_");
  // Config key: how duplicates are grouped ("none", "host" or "domain").
  private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
  // Config key: comma-separated order of fields compared to pick the winner.
  private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
+
+  public static class DBFilter implements
+      Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
+      
+    private String groupMode;
+
+    @Override
+    public void configure(JobConf arg0) {
+      groupMode = arg0.get(DEDUPLICATION_GROUP_MODE);
+    }
+
+    @Override
+    public void close() throws IOException {
+    }
+
+    @Override
+    public void map(Text key, CrawlDatum value,
+        OutputCollector<BytesWritable, CrawlDatum> output, Reporter reporter)
+        throws IOException {
+
+      if (value.getStatus() == CrawlDatum.STATUS_DB_FETCHED
+          || value.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+        // || value.getStatus() ==CrawlDatum.STATUS_DB_GONE){
+        byte[] signature = value.getSignature();
+        if (signature == null)
+          return;
+        String url = key.toString();
+        BytesWritable sig = null;
+        byte[] data;
+        switch (groupMode) {
+          case "none":
+            sig = new BytesWritable(signature);
+            break;
+          case "host":
+            byte[] host = URLUtil.getHost(url).getBytes();
+            data = new byte[signature.length + host.length];
+            System.arraycopy(signature, 0, data, 0, signature.length);
+            System.arraycopy(host, 0, data, signature.length, host.length);
+            sig = new BytesWritable(data);
+            break;
+          case "domain":
+            byte[] domain = URLUtil.getDomainName(url).getBytes();
+            data = new byte[signature.length + domain.length];
+            System.arraycopy(signature, 0, data, 0, signature.length);
+            System.arraycopy(domain, 0, data, signature.length, domain.length);
+            sig = new BytesWritable(data);
+            break;
+        }
+        // add the URL as a temporary MD
+        value.getMetaData().put(urlKey, key);
+        // reduce on the signature optionall grouped on host or domain or not at all
+        output.collect(sig, value);
+      }
+    }
+  }
+
  /**
   * Reducer that receives all {@link CrawlDatum} entries sharing the same
   * signature key and keeps one of them, emitting every other entry with
   * status {@code STATUS_DB_DUPLICATE}. The winner is chosen by the criteria
   * listed in {@code DEDUPLICATION_COMPARE_ORDER} (score, fetchTime,
   * urlLength), applied in order until one of them breaks the tie. Entries
   * that tie on every criterion are simply not emitted (i.e. left untouched
   * in the CrawlDb by the later merge step).
   */
  public static class DedupReducer implements
      Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {

    // tie-breaking criteria, in decreasing priority, from the job config
    private String[] compareOrder;
    
    @Override
    public void configure(JobConf arg0) {
      compareOrder = arg0.get(DEDUPLICATION_COMPARE_ORDER).split(",");
    }

    /**
     * Marks {@code datum} as a duplicate and emits it, keyed by its original
     * URL which was stashed earlier as a temporary metadata entry under
     * {@code urlKey}; the temporary entry is removed before emitting.
     */
    private void writeOutAsDuplicate(CrawlDatum datum,
        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
        throws IOException {
      datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
      Text key = (Text) datum.getMetaData().remove(urlKey);
      reporter.incrCounter("DeduplicationJobStatus",
          "Documents marked as duplicate", 1);
      output.collect(key, datum);
    }

    @Override
    public void reduce(BytesWritable key, Iterator<CrawlDatum> values,
        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
        throws IOException {
      // the entry currently considered the one to keep
      CrawlDatum existingDoc = null;

      outerloop:
      while (values.hasNext()) {
        if (existingDoc == null) {
          // first entry: deep-copy it, since Hadoop may reuse the instance
          existingDoc = new CrawlDatum();
          existingDoc.set(values.next());
          continue;
        }
        CrawlDatum newDoc = values.next();

        // apply criteria in configured order; the first criterion that
        // distinguishes the two entries decides which one is the duplicate
        for (int i = 0; i < compareOrder.length; i++) {
          switch (compareOrder[i]) {
            case "score":
              // compare based on score: keep the higher-scoring entry
              if (existingDoc.getScore() < newDoc.getScore()) {
                writeOutAsDuplicate(existingDoc, output, reporter);
                existingDoc = new CrawlDatum();
                existingDoc.set(newDoc);
                continue outerloop;
              } else if (existingDoc.getScore() > newDoc.getScore()) {
                // mark new one as duplicate
                writeOutAsDuplicate(newDoc, output, reporter);
                continue outerloop;
              }
              break;
            case "fetchTime":
              // same score? delete the one which is oldest
              if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
                // mark new one as duplicate
                writeOutAsDuplicate(newDoc, output, reporter);
                continue outerloop;
              } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
                // mark existing one as duplicate
                writeOutAsDuplicate(existingDoc, output, reporter);
                existingDoc = new CrawlDatum();
                existingDoc.set(newDoc);
                continue outerloop;
              }
              break;
            case "urlLength":
              // same time? keep the one which has the shortest URL
              String urlExisting;
              String urlnewDoc;
              try {
                urlExisting = URLDecoder.decode(existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
                urlnewDoc = URLDecoder.decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
              } catch (UnsupportedEncodingException e) {
                LOG.error("Error decoding: " + urlKey);
                throw new IOException("UnsupportedEncodingException for " + urlKey);
              }
              if (urlExisting.length() < urlnewDoc.length()) {
                // mark new one as duplicate
                writeOutAsDuplicate(newDoc, output, reporter);
                continue outerloop;
              } else if (urlExisting.length() > urlnewDoc.length()) {
                // mark existing one as duplicate
                writeOutAsDuplicate(existingDoc, output, reporter);
                existingDoc = new CrawlDatum();
                existingDoc.set(newDoc);
                continue outerloop;
              }
              break;
          }
        }

      }
    }

    @Override
    public void close() throws IOException {
      // nothing to release
    }
  }
+
+  /** Combine multiple new entries for a url. */
+  public static class StatusUpdateReducer implements
+      Reducer<Text, CrawlDatum, Text, CrawlDatum> {
+
+    public void configure(JobConf job) {
+    }
+
+    public void close() {
+    }
+
+    private CrawlDatum old = new CrawlDatum();
+    private CrawlDatum duplicate = new CrawlDatum();
+
+    public void reduce(Text key, Iterator<CrawlDatum> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
+      boolean duplicateSet = false;
+
+      while (values.hasNext()) {
+        CrawlDatum val = values.next();
+        if (val.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+          duplicate.set(val);
+          duplicateSet = true;
+        } else {
+          old.set(val);
+        }
+      }
+
+      // keep the duplicate if there is one
+      if (duplicateSet) {
+        output.collect(key, duplicate);
+        return;
+      }
+
+      // no duplicate? keep old one then
+      output.collect(key, old);
+    }
+  }
+
+  public int run(String[] args) throws IOException {
+    if (args.length < 1) {
+      System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<urlLength>]");
+      return 1;
+    }
+
+    String group = "none";
+    String crawldb = args[0];
+    String compareOrder = "score,fetchTime,urlLength";
+
+    for (int i = 1; i < args.length; i++) {
+      if (args[i].equals("-group")) 
+        group = args[++i];
+      if (args[i].equals("-compareOrder")) {
+        compareOrder = args[++i];
+
+        if (compareOrder.indexOf("score") == -1 ||
+            compareOrder.indexOf("fetchTime") == -1 ||
+            compareOrder.indexOf("urlLength") == -1) {
+          System.err.println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
+          return 1;
+        }
+      }
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("DeduplicationJob: starting at " + sdf.format(start));
+
+    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
+        + "/dedup-temp-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    JobConf job = new NutchJob(getConf());
+
+    job.setJobName("Deduplication on " + crawldb);
+    job.set(DEDUPLICATION_GROUP_MODE, group);
+    job.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
+
+    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    FileOutputFormat.setOutputPath(job, tempDir);
+    job.setOutputFormat(SequenceFileOutputFormat.class);
+
+    job.setMapOutputKeyClass(BytesWritable.class);
+    job.setMapOutputValueClass(CrawlDatum.class);
+
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+
+    job.setMapperClass(DBFilter.class);
+    job.setReducerClass(DedupReducer.class);
+
+    try {
+      RunningJob rj = JobClient.runJob(job);
+      Group g = rj.getCounters().getGroup("DeduplicationJobStatus");
+      if (g != null) {
+        long dups = g.getCounter("Documents marked as duplicate");
+        LOG.info("Deduplication: " + (int) dups
+            + " documents marked as duplicates");
+      }
+    } catch (final Exception e) {
+      LOG.error("DeduplicationJob: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+
+    // merge with existing crawl db
+    if (LOG.isInfoEnabled()) {
+      LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
+    }
+
+    Path dbPath = new Path(crawldb);
+    JobConf mergeJob = CrawlDb.createJob(getConf(), dbPath);
+    FileInputFormat.addInputPath(mergeJob, tempDir);
+    mergeJob.setReducerClass(StatusUpdateReducer.class);
+
+    try {
+      JobClient.runJob(mergeJob);
+    } catch (final Exception e) {
+      LOG.error("DeduplicationMergeJob: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+
+    CrawlDb.install(mergeJob, dbPath);
+
+    // clean up
+    FileSystem fs = FileSystem.get(getConf());
+    fs.delete(tempDir, true);
+
+    long end = System.currentTimeMillis();
+    LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    int result = ToolRunner.run(NutchConfiguration.create(),
+        new DeduplicationJob(), args);
+    System.exit(result);
+  }
+
+  @Override
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+    Map<String, Object> results = new HashMap<String, Object>();
+    String[] arg = new String[1];
+    String crawldb;
+    if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+      crawldb = (String)args.get(Nutch.ARG_CRAWLDB);
+    }
+    else {
+      crawldb = crawlId+"/crawldb";
+    }
+    arg[0] = crawldb;
+    int res = run(arg);
+    results.put(Nutch.VAL_RESULT, Integer.toString(res));
+    return results;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/DefaultFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
new file mode 100755
index 0000000..4a60a1c
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * This class implements the default re-fetch schedule. That is, no matter if
+ * the page was changed or not, the <code>fetchInterval</code> remains
+ * unchanged, and the updated page fetchTime will always be set to
+ * <code>fetchTime + fetchInterval * 1000</code>.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class DefaultFetchSchedule extends AbstractFetchSchedule {
+
+  @Override
+  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime,
+      long modifiedTime, int state) {
+    datum = super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+        fetchTime, modifiedTime, state);
+    if (datum.getFetchInterval() == 0) {
+      datum.setFetchInterval(defaultInterval);
+    }
+    datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000);
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+}


[46/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/Fetcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/Fetcher.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/Fetcher.java
new file mode 100644
index 0000000..aad9ee9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/Fetcher.java
@@ -0,0 +1,600 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.io.File;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.util.*;
+
+/**
+ * A queue-based fetcher.
+ * 
+ * <p>
+ * This fetcher uses a well-known model of one producer (a QueueFeeder) and many
+ * consumers (FetcherThread-s).
+ * 
+ * <p>
+ * QueueFeeder reads input fetchlists and populates a set of FetchItemQueue-s,
+ * which hold FetchItem-s that describe the items to be fetched. There are as
+ * many queues as there are unique hosts, but at any given time the total number
+ * of fetch items in all queues is less than a fixed number (currently set to a
+ * multiple of the number of threads).
+ * 
+ * <p>
+ * As items are consumed from the queues, the QueueFeeder continues to add new
+ * input items, so that their total count stays fixed (FetcherThread-s may also
+ * add new items to the queues e.g. as a results of redirection) - until all
+ * input items are exhausted, at which point the number of items in the queues
+ * begins to decrease. When this number reaches 0 fetcher will finish.
+ * 
+ * <p>
+ * This fetcher implementation handles per-host blocking itself, instead of
+ * delegating this work to protocol-specific plugins. Each per-host queue
+ * handles its own "politeness" settings, such as the maximum number of
+ * concurrent requests and crawl delay between consecutive requests - and also a
+ * list of requests in progress, and the time the last request was finished. As
+ * FetcherThread-s ask for new items to be fetched, queues may return eligible
+ * items or null if for "politeness" reasons this host's queue is not yet ready.
+ * 
+ * <p>
+ * If there are still unfetched items in the queues, but none of the items are
+ * ready, FetcherThread-s will spin-wait until either some items become
+ * available, or a timeout is reached (at which point the Fetcher will abort,
+ * assuming the task is hung).
+ * 
+ * @author Andrzej Bialecki
+ */
+public class Fetcher extends NutchTool implements Tool,
+MapRunnable<Text, CrawlDatum, Text, NutchWritable> {
+
  // NOTE(review): used by callers outside this view — TODO confirm semantics
  public static final int PERM_REFRESH_TIME = 5;

  // metadata key for the content of a redirect, presumably set by protocol
  // plugins (usage not visible here)
  public static final String CONTENT_REDIR = "content";

  // metadata key for a protocol-level redirect (usage not visible here)
  public static final String PROTOCOL_REDIR = "protocol";

  public static final Logger LOG = LoggerFactory.getLogger(Fetcher.class);
+
+  public static class InputFormat extends
+  SequenceFileInputFormat<Text, CrawlDatum> {
+    /** Don't split inputs, to keep things polite. */
+    public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException {
+      FileStatus[] files = listStatus(job);
+      FileSplit[] splits = new FileSplit[files.length];
+      for (int i = 0; i < files.length; i++) {
+        FileStatus cur = files[i];
+        splits[i] = new FileSplit(cur.getPath(), 0, cur.getLen(),
+            (String[]) null);
+      }
+      return splits;
+    }
+  }
+
  // collector/reporter captured from run() for use by fetcher threads
  @SuppressWarnings("unused")
  private OutputCollector<Text, NutchWritable> output;
  private Reporter reporter;

  // name of the segment being fetched (from the job configuration)
  private String segmentName;
  private AtomicInteger activeThreads = new AtomicInteger(0);
  private AtomicInteger spinWaiting = new AtomicInteger(0);

  private long start = System.currentTimeMillis(); // start time of fetcher run
  private AtomicLong lastRequestStart = new AtomicLong(start);

  private AtomicLong bytes = new AtomicLong(0); // total bytes fetched
  private AtomicInteger pages = new AtomicInteger(0); // total pages fetched
  private AtomicInteger errors = new AtomicInteger(0); // total pages errored

  // whether to store raw content / parse inline (from configuration)
  private boolean storingContent;
  private boolean parsing;
  // per-host queues of items to fetch, shared by all threads
  FetchItemQueues fetchQueues;
  // producer thread feeding the queues from the input fetchlist
  QueueFeeder feeder;

  // all consumer threads spawned so far (last ones are halted first when
  // throttling down)
  LinkedList<FetcherThread> fetcherThreads = new LinkedList<FetcherThread>();
+
  /** No-arg constructor; configuration is supplied later via {@code configure}/{@code setConf}. */
  public Fetcher() {
    super(null);
  }
+
  /** Constructs a fetcher with the given configuration. */
  public Fetcher(Configuration conf) {
    super(conf);
  }
+
+  private void reportStatus(int pagesLastSec, int bytesLastSec)
+      throws IOException {
+    StringBuilder status = new StringBuilder();
+    Long elapsed = new Long((System.currentTimeMillis() - start) / 1000);
+
+    float avgPagesSec = (float) pages.get() / elapsed.floatValue();
+    long avgBytesSec = (bytes.get() / 128l) / elapsed.longValue();
+
+    status.append(activeThreads).append(" threads (").append(spinWaiting.get())
+    .append(" waiting), ");
+    status.append(fetchQueues.getQueueCount()).append(" queues, ");
+    status.append(fetchQueues.getTotalSize()).append(" URLs queued, ");
+    status.append(pages).append(" pages, ").append(errors).append(" errors, ");
+    status.append(String.format("%.2f", avgPagesSec)).append(" pages/s (");
+    status.append(pagesLastSec).append(" last sec), ");
+    status.append(avgBytesSec).append(" kbits/s (")
+    .append((bytesLastSec / 128)).append(" last sec)");
+
+    reporter.setStatus(status.toString());
+  }
+
  /**
   * Per-task setup: stores the job configuration and caches the segment name
   * plus the store-content / parse flags read from it.
   */
  public void configure(JobConf job) {
    setConf(job);

    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
    this.storingContent = isStoringContent(job);
    this.parsing = isParsing(job);

    // if (job.getBoolean("fetcher.verbose", false)) {
    // LOG.setLevel(Level.FINE);
    // }
  }
+
  /** Per-task cleanup; nothing to release here. */
  public void close() {
  }
+
  /** Returns the {@code fetcher.parse} flag (parse documents inline), default true. */
  public static boolean isParsing(Configuration conf) {
    return conf.getBoolean("fetcher.parse", true);
  }
+
  /** Returns the {@code fetcher.store.content} flag (store raw content), default true. */
  public static boolean isStoringContent(Configuration conf) {
    return conf.getBoolean("fetcher.store.content", true);
  }
+
  /**
   * Map-runner entry point. Starts the {@link QueueFeeder} and the initial
   * pool of {@link FetcherThread}s, then loops once per second to: report
   * progress, enforce the throughput threshold, grow/shrink the thread pool
   * toward the target bandwidth, apply the time limit, and abort on hung
   * threads. Returns when all fetcher threads have exited.
   */
  public void run(RecordReader<Text, CrawlDatum> input,
      OutputCollector<Text, NutchWritable> output, Reporter reporter)
          throws IOException {

    this.output = output;
    this.reporter = reporter;
    this.fetchQueues = new FetchItemQueues(getConf());

    int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: threads: {}", threadCount);
    }

    int timeoutDivisor = getConf().getInt("fetcher.threads.timeout.divisor", 2);
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: time-out divisor: {}", timeoutDivisor);
    }

    int queueDepthMuliplier = getConf().getInt(
        "fetcher.queue.depth.multiplier", 50);

    // single producer keeping ~threadCount*multiplier items in the queues
    feeder = new QueueFeeder(input, fetchQueues, threadCount
        * queueDepthMuliplier);
    // feeder.setPriority((Thread.MAX_PRIORITY + Thread.NORM_PRIORITY) / 2);

    // the value of the time limit is either -1 or the time where it should
    // finish
    long timelimit = getConf().getLong("fetcher.timelimit", -1);
    if (timelimit != -1)
      feeder.setTimeLimit(timelimit);
    feeder.start();

    // set non-blocking & no-robots mode for HTTP protocol plugins.
    getConf().setBoolean(Protocol.CHECK_BLOCKING, false);
    getConf().setBoolean(Protocol.CHECK_ROBOTS, false);

    for (int i = 0; i < threadCount; i++) { // spawn threads
      FetcherThread t = new FetcherThread(getConf(), getActiveThreads(), fetchQueues, 
          feeder, spinWaiting, lastRequestStart, reporter, errors, segmentName,
          parsing, output, storingContent, pages, bytes);
      fetcherThreads.add(t);
      t.start();
    }

    // select a timeout that avoids a task timeout
    long timeout = getConf().getInt("mapred.task.timeout", 10 * 60 * 1000)
        / timeoutDivisor;

    // Used for threshold check, holds pages and bytes processed in the last
    // second
    int pagesLastSec;
    int bytesLastSec;

    int throughputThresholdNumRetries = 0;

    int throughputThresholdPages = getConf().getInt(
        "fetcher.throughput.threshold.pages", -1);
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: throughput threshold: {}", throughputThresholdPages);
    }
    int throughputThresholdMaxRetries = getConf().getInt(
        "fetcher.throughput.threshold.retries", 5);
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: throughput threshold retries: {}",
          throughputThresholdMaxRetries);
    }
    long throughputThresholdTimeLimit = getConf().getLong(
        "fetcher.throughput.threshold.check.after", -1);

    // target bandwidth in bits per second (config value is in kbits/s)
    int targetBandwidth = getConf().getInt("fetcher.bandwidth.target", -1) * 1000;
    int maxNumThreads = getConf().getInt("fetcher.maxNum.threads", threadCount);
    if (maxNumThreads < threadCount) {
      LOG.info("fetcher.maxNum.threads can't be < than {} : using {} instead",
          threadCount, threadCount);
      maxNumThreads = threadCount;
    }
    int bandwidthTargetCheckEveryNSecs = getConf().getInt(
        "fetcher.bandwidth.target.check.everyNSecs", 30);
    if (bandwidthTargetCheckEveryNSecs < 1) {
      LOG.info("fetcher.bandwidth.target.check.everyNSecs can't be < to 1 : using 1 instead");
      bandwidthTargetCheckEveryNSecs = 1;
    }

    int maxThreadsPerQueue = getConf().getInt("fetcher.threads.per.queue", 1);

    int bandwidthTargetCheckCounter = 0;
    long bytesAtLastBWTCheck = 0l;

    do { // wait for threads to exit
      // snapshot counters, sleep one second, then convert to per-second rates
      pagesLastSec = pages.get();
      bytesLastSec = (int) bytes.get();

      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
      }

      pagesLastSec = pages.get() - pagesLastSec;
      bytesLastSec = (int) bytes.get() - bytesLastSec;

      reporter.incrCounter("FetcherStatus", "bytes_downloaded", bytesLastSec);

      reportStatus(pagesLastSec, bytesLastSec);

      LOG.info("-activeThreads=" + activeThreads + ", spinWaiting="
          + spinWaiting.get() + ", fetchQueues.totalSize="
          + fetchQueues.getTotalSize() + ", fetchQueues.getQueueCount="
          + fetchQueues.getQueueCount());

      // near the end of the run, dump the few remaining queues for debugging
      if (!feeder.isAlive() && fetchQueues.getTotalSize() < 5) {
        fetchQueues.dump();
      }

      // if throughput threshold is enabled
      if (throughputThresholdTimeLimit < System.currentTimeMillis()
          && throughputThresholdPages != -1) {
        // Check if we're dropping below the threshold
        if (pagesLastSec < throughputThresholdPages) {
          throughputThresholdNumRetries++;
          LOG.warn("{}: dropping below configured threshold of {} pages per second",
              Integer.toString(throughputThresholdNumRetries), Integer.toString(throughputThresholdPages));

          // Quit if we dropped below threshold too many times
          if (throughputThresholdNumRetries == throughputThresholdMaxRetries) {
            LOG.warn("Dropped below threshold too many times, killing!");

            // Disable the threshold checker
            throughputThresholdPages = -1;

            // Empty the queues cleanly and get number of items that were
            // dropped
            int hitByThrougputThreshold = fetchQueues.emptyQueues();

            if (hitByThrougputThreshold != 0)
              reporter.incrCounter("FetcherStatus", "hitByThrougputThreshold",
                  hitByThrougputThreshold);
          }
        }
      }

      // adjust the number of threads if a target bandwidth has been set
      if (targetBandwidth > 0) {
        if (bandwidthTargetCheckCounter < bandwidthTargetCheckEveryNSecs)
          bandwidthTargetCheckCounter++;
        else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
          // bits per second since the last bandwidth check
          long bpsSinceLastCheck = ((bytes.get() - bytesAtLastBWTCheck) * 8)
              / bandwidthTargetCheckEveryNSecs;

          bytesAtLastBWTCheck = bytes.get();
          bandwidthTargetCheckCounter = 0;

          int averageBdwPerThread = 0;
          if (activeThreads.get() > 0)
            averageBdwPerThread = Math.round(bpsSinceLastCheck
                / activeThreads.get());

          LOG.info("averageBdwPerThread : {} kbps", (averageBdwPerThread / 1000));

          if (bpsSinceLastCheck < targetBandwidth && averageBdwPerThread > 0) {
            // check whether it is worth doing e.g. more queues than threads

            if ((fetchQueues.getQueueCount() * maxThreadsPerQueue) > activeThreads
                .get()) {

              long remainingBdw = targetBandwidth - bpsSinceLastCheck;
              int additionalThreads = Math.round(remainingBdw
                  / averageBdwPerThread);
              int availableThreads = maxNumThreads - activeThreads.get();

              // determine the number of available threads (min between
              // availableThreads and additionalThreads)
              additionalThreads = (availableThreads < additionalThreads ? availableThreads
                  : additionalThreads);
              LOG.info("Has space for more threads ({} vs {} kbps) \t=> adding {} new threads",
                  (bpsSinceLastCheck / 1000), (targetBandwidth / 1000), additionalThreads);
              // activate new threads
              for (int i = 0; i < additionalThreads; i++) {
                FetcherThread thread = new FetcherThread(getConf(), getActiveThreads(), fetchQueues, 
                    feeder, spinWaiting, lastRequestStart, reporter, errors, segmentName, parsing,
                    output, storingContent, pages, bytes);
                fetcherThreads.add(thread);
                thread.start();
              }
            }
          } else if (bpsSinceLastCheck > targetBandwidth
              && averageBdwPerThread > 0) {
            // if the bandwidth we're using is greater then the expected
            // bandwidth, we have to stop some threads
            long excessBdw = bpsSinceLastCheck - targetBandwidth;
            int excessThreads = Math.round(excessBdw / averageBdwPerThread);
            LOG.info("Exceeding target bandwidth ({} vs {} kbps). \t=> excessThreads = {}",
                bpsSinceLastCheck / 1000, (targetBandwidth / 1000), excessThreads);
            // keep at least one
            if (excessThreads >= fetcherThreads.size())
              excessThreads = 0;
            // de-activates threads
            for (int i = 0; i < excessThreads; i++) {
              FetcherThread thread = fetcherThreads.removeLast();
              thread.setHalted(true);
            }
          }
        }
      }

      // check timelimit
      if (!feeder.isAlive()) {
        int hitByTimeLimit = fetchQueues.checkTimelimit();
        if (hitByTimeLimit != 0)
          reporter.incrCounter("FetcherStatus", "hitByTimeLimit",
              hitByTimeLimit);
      }

      // some requests seem to hang, despite all intentions
      if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Aborting with {} hung threads.", activeThreads);
          for (int i = 0; i < fetcherThreads.size(); i++) {
            FetcherThread thread = fetcherThreads.get(i);
            if (thread.isAlive()) {
              LOG.warn("Thread #{} hung while processing {}", i, thread.getReprUrl());
              if (LOG.isDebugEnabled()) {
                StackTraceElement[] stack = thread.getStackTrace();
                StringBuilder sb = new StringBuilder();
                sb.append("Stack of thread #").append(i).append(":\n");
                for (StackTraceElement s : stack) {
                  sb.append(s.toString()).append('\n');
                }
                LOG.debug(sb.toString());
              }
            }
          }
        }
        return;
      }

    } while (activeThreads.get() > 0);
    LOG.info("-activeThreads={}", activeThreads);

  }
+
  /**
   * Configures and runs the fetch MapReduce job over the given segment.
   * Translates the relative time limits from the configuration into absolute
   * deadlines (so retried tasks share the same deadline), logs the outlink
   * following limits, and disables speculative execution for politeness.
   *
   * @param segment segment directory containing the generated fetchlist
   * @param threads number of fetcher threads per task
   */
  public void fetch(Path segment, int threads) throws IOException {

    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Fetcher: starting at {}", sdf.format(start));
      LOG.info("Fetcher: segment: {}", segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
      timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
      LOG.info("Fetcher Timelimit set for : {}", timelimit);
      getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is
    // enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after",
        10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
      LOG.info("Fetcher: following outlinks up to depth: {}",
          Integer.toString(maxOutlinkDepth));

      int maxOutlinkDepthNumLinks = getConf().getInt(
          "fetcher.follow.outlinks.num.links", 4);
      int outlinksDepthDivisor = getConf().getInt(
          "fetcher.follow.outlinks.depth.divisor", 2);

      // sum of the per-depth link budgets, purely informational
      int totalOutlinksToFollow = 0;
      for (int i = 0; i < maxOutlinkDepth; i++) {
        totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor
            / (i + 1) * maxOutlinkDepthNumLinks);
      }

      LOG.info("Fetcher: maximum outlinks to follow: {}",
          Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment,
        CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at {}, elapsed: {}", sdf.format(end),
        TimingUtil.elapsedTime(start, end));
  }
+
  /** Run the fetcher. Command-line entry point; delegates to {@link #run(String[])}. */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new Fetcher(), args);
    System.exit(res);
  }
+
+  public int run(String[] args) throws Exception {
+
+    String usage = "Usage: Fetcher <segment> [-threads n]";
+
+    if (args.length < 1) {
+      System.err.println(usage);
+      return -1;
+    }
+
+    Path segment = new Path(args[0]);
+
+    int threads = getConf().getInt("fetcher.threads.fetch", 10);
+
+    for (int i = 1; i < args.length; i++) { // parse command line
+      if (args[i].equals("-threads")) { // found -threads option
+        threads = Integer.parseInt(args[++i]);
+      }
+    }
+
+    getConf().setInt("fetcher.threads.fetch", threads);
+
+    try {
+      fetch(segment, threads);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("Fetcher: {}", StringUtils.stringifyException(e));
+      return -1;
+    }
+
+  }
+
+  private void checkConfiguration() {
+    // ensure that a value has been set for the agent name
+    String agentName = getConf().get("http.agent.name");
+    if (agentName == null || agentName.trim().length() == 0) {
+      String message = "Fetcher: No agents listed in 'http.agent.name'"
+          + " property.";
+      if (LOG.isErrorEnabled()) {
+        LOG.error(message);
+      }
+      throw new IllegalArgumentException(message);
+    }
+  }
+
  /** Shared active-thread counter, handed to each spawned {@code FetcherThread}. */
  private AtomicInteger getActiveThreads() {
    return activeThreads;
  }
+
+  @Override
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+
+    Path segment;
+    if(args.containsKey(Nutch.ARG_SEGMENT)) {
+      Object seg = args.get(Nutch.ARG_SEGMENT);
+      if(seg instanceof Path) {
+        segment = (Path) seg;
+      }
+      else {
+        segment = new Path(seg.toString());
+      }
+    }
+    else {
+      String segment_dir = crawlId+"/segments";
+      File segmentsDir = new File(segment_dir);
+      File[] segmentsList = segmentsDir.listFiles();  
+      Arrays.sort(segmentsList, new Comparator<File>(){
+        @Override
+        public int compare(File f1, File f2) {
+          if(f1.lastModified()>f2.lastModified())
+            return -1;
+          else
+            return 0;
+        }      
+      });
+      segment = new Path(segmentsList[0].getPath());
+    }
+
+
+    int threads = getConf().getInt("fetcher.threads.fetch", 10);
+
+    // parse command line
+    if (args.containsKey("threads")) { // found -threads option
+      threads = Integer.parseInt((String)args.get("threads"));
+    }
+    getConf().setInt("fetcher.threads.fetch", threads);
+
+    try {
+      fetch(segment, threads);
+      results.put(Nutch.VAL_RESULT, Integer.toString(0));
+      return results;
+    } catch (Exception e) {
+      LOG.error("Fetcher: {}", StringUtils.stringifyException(e));
+      results.put(Nutch.VAL_RESULT, Integer.toString(-1));
+      return results;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/FetcherOutputFormat.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
new file mode 100644
index 0000000..d526a07
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.fetcher;
+
+import java.io.IOException;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.InvalidJobConfException;
+import org.apache.hadoop.mapred.OutputFormat;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.util.Progressable;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseOutputFormat;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Splits FetcherOutput entries into multiple map files: fetch status
+ * (CrawlDatum) always goes to the crawl_fetch directory; raw Content and
+ * parse output are written only when the job is configured to store content
+ * and/or parse while fetching.
+ */
+public class FetcherOutputFormat implements OutputFormat<Text, NutchWritable> {
+
+  /**
+   * Verifies that an output directory is set and that the segment has not
+   * already been fetched (i.e. no crawl_fetch subdirectory exists yet).
+   *
+   * @throws InvalidJobConfException when no output path is configured for a
+   *         job with reduce tasks
+   * @throws IOException when the segment was fetched before
+   */
+  public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
+    Path out = FileOutputFormat.getOutputPath(job);
+    if ((out == null) && (job.getNumReduceTasks() != 0)) {
+      throw new InvalidJobConfException("Output directory not set in JobConf.");
+    }
+    if (fs == null) {
+      fs = out.getFileSystem(job);
+    }
+    if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
+      throw new IOException("Segment already fetched!");
+  }
+
+  /**
+   * Builds a RecordWriter that demultiplexes NutchWritable values by their
+   * wrapped type: CrawlDatum to crawl_fetch, Content to the content
+   * directory, Parse to the parse output format. The optional writers are
+   * created in the anonymous class's instance initializer, driven by the
+   * job configuration.
+   */
+  public RecordWriter<Text, NutchWritable> getRecordWriter(final FileSystem fs,
+      final JobConf job, final String name, final Progressable progress)
+          throws IOException {
+
+    Path out = FileOutputFormat.getOutputPath(job);
+    final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
+    final Path content = new Path(new Path(out, Content.DIR_NAME), name);
+
+    final CompressionType compType = SequenceFileOutputFormat
+        .getOutputCompressionType(job);
+
+    // Writer options for the crawl_fetch MapFile (Text key -> CrawlDatum)
+    Option fKeyClassOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option fValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option fProgressOpt = SequenceFile.Writer.progressable(progress);
+    org.apache.hadoop.io.SequenceFile.Writer.Option fCompOpt = SequenceFile.Writer.compression(compType);
+
+    final MapFile.Writer fetchOut = new MapFile.Writer(job,
+        fetch, fKeyClassOpt, fValClassOpt, fCompOpt, fProgressOpt);
+
+    return new RecordWriter<Text, NutchWritable>() {
+      // stays null unless the job stores raw content
+      private MapFile.Writer contentOut;
+      // stays null unless the job parses while fetching
+      private RecordWriter<Text, Parse> parseOut;
+
+      {
+        if (Fetcher.isStoringContent(job)) {
+          Option cKeyClassOpt = MapFile.Writer.keyClass(Text.class);
+          org.apache.hadoop.io.SequenceFile.Writer.Option cValClassOpt = SequenceFile.Writer.valueClass(Content.class);
+          org.apache.hadoop.io.SequenceFile.Writer.Option cProgressOpt = SequenceFile.Writer.progressable(progress);
+          org.apache.hadoop.io.SequenceFile.Writer.Option cCompOpt = SequenceFile.Writer.compression(compType);
+          contentOut = new MapFile.Writer(job, content,
+              cKeyClassOpt, cValClassOpt, cCompOpt, cProgressOpt);
+        }
+
+        if (Fetcher.isParsing(job)) {
+          parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name,
+              progress);
+        }
+      }
+
+      /**
+       * Routes one record to the writer matching its wrapped type; records
+       * for outputs that are disabled (null writer) are silently dropped.
+       */
+      public void write(Text key, NutchWritable value) throws IOException {
+
+        Writable w = value.get();
+
+        if (w instanceof CrawlDatum)
+          fetchOut.append(key, w);
+        else if (w instanceof Content && contentOut != null)
+          contentOut.append(key, w);
+        else if (w instanceof Parse && parseOut != null)
+          parseOut.write(key, (Parse) w);
+      }
+
+      /** Closes the mandatory fetch writer and any optional writers. */
+      public void close(Reporter reporter) throws IOException {
+        fetchOut.close();
+        if (contentOut != null) {
+          contentOut.close();
+        }
+        if (parseOut != null) {
+          parseOut.close(reporter);
+        }
+      }
+
+    };
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/FetcherThread.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/FetcherThread.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetcherThread.java
new file mode 100644
index 0000000..e57e735
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -0,0 +1,768 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLExemptionFilters;
+import org.apache.nutch.net.URLFilterException;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseOutputFormat;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.service.NutchServer;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class picks items from queues and fetches the pages.
+ */
+public class FetcherThread extends Thread {
+  
+  private static final Logger LOG = LoggerFactory.getLogger(FetcherThread.class);
+
+  private Configuration conf;
+  private URLFilters urlFilters;
+  private URLExemptionFilters urlExemptionFilters;
+  private ScoringFilters scfilters;
+  private ParseUtil parseUtil;
+  private URLNormalizers normalizers;
+  private ProtocolFactory protocolFactory;
+  private long maxCrawlDelay;
+  private String queueMode;
+  private int maxRedirect;
+  private String reprUrl;
+  private boolean redirecting;
+  private int redirectCount;
+  private boolean ignoreInternalLinks;
+  private boolean ignoreExternalLinks;
+  private String ignoreExternalLinksMode;
+
+  // Used by fetcher.follow.outlinks.depth in parse
+  private int maxOutlinksPerPage;
+  private final int maxOutlinks;
+  private final int interval;
+  private int maxOutlinkDepth;
+  private int maxOutlinkDepthNumLinks;
+  private boolean outlinksIgnoreExternal;
+
+  private int outlinksDepthDivisor;
+  private boolean skipTruncated;
+
+  private boolean halted = false;
+
+  private AtomicInteger activeThreads;
+
+  private Object fetchQueues;
+
+  private QueueFeeder feeder;
+
+  private Object spinWaiting;
+
+  private AtomicLong lastRequestStart;
+
+  private Reporter reporter;
+
+  private AtomicInteger errors;
+
+  private String segmentName;
+
+  private boolean parsing;
+
+  private OutputCollector<Text, NutchWritable> output;
+
+  private boolean storingContent;
+
+  private AtomicInteger pages;
+
+  private AtomicLong bytes;
+  
+  //Used by the REST service
+  private FetchNode fetchNode;
+  private boolean reportToNutchServer;
+
+  /**
+   * Creates a fetcher thread bound to the shared queues, counters and output
+   * collector of the enclosing fetch job. All tuning knobs (queue mode,
+   * crawl-delay cap, redirect limit, outlink-following depth/limits,
+   * internal/external link policy) are read once here from {@code conf}.
+   * The thread is a daemon so it never keeps the JVM alive.
+   */
+  public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, 
+      QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, Reporter reporter,
+      AtomicInteger errors, String segmentName, boolean parsing, OutputCollector<Text, NutchWritable> output,
+      boolean storingContent, AtomicInteger pages, AtomicLong bytes) {
+    this.setDaemon(true); // don't hang JVM on exit
+    this.setName("FetcherThread"); // use an informative name
+    this.conf = conf;
+    this.urlFilters = new URLFilters(conf);
+    this.urlExemptionFilters = new URLExemptionFilters(conf);
+    this.scfilters = new ScoringFilters(conf);
+    this.parseUtil = new ParseUtil(conf);
+    this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
+    this.protocolFactory = new ProtocolFactory(conf);
+    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
+    this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
+    this.activeThreads = activeThreads;
+    this.fetchQueues = fetchQueues;
+    this.feeder = feeder;
+    this.spinWaiting = spinWaiting;
+    this.lastRequestStart = lastRequestStart;
+    this.reporter = reporter;
+    this.errors = errors;
+    this.segmentName = segmentName;
+    this.parsing = parsing;
+    this.output = output;
+    this.storingContent = storingContent;
+    this.pages = pages;
+    this.bytes = bytes;
+    queueMode = conf.get("fetcher.queue.mode",
+        FetchItemQueues.QUEUE_MODE_HOST);
+    // check that the mode is known; fall back to byHost on a bad value
+    // rather than failing the whole job
+    if (!queueMode.equals(FetchItemQueues.QUEUE_MODE_IP)
+        && !queueMode.equals(FetchItemQueues.QUEUE_MODE_DOMAIN)
+        && !queueMode.equals(FetchItemQueues.QUEUE_MODE_HOST)) {
+      LOG.error("Unknown partition mode : " + queueMode
+          + " - forcing to byHost");
+      queueMode = FetchItemQueues.QUEUE_MODE_HOST;
+    }
+    LOG.info("Using queue mode : " + queueMode);
+    this.maxRedirect = conf.getInt("http.redirect.max", 3);
+
+    // a negative db.max.outlinks.per.page means "no limit"
+    maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
+    maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
+        : maxOutlinksPerPage;
+    interval = conf.getInt("db.fetch.interval.default", 2592000);
+    ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
+    ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
+    ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
+    maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
+    outlinksIgnoreExternal = conf.getBoolean(
+        "fetcher.follow.outlinks.ignore.external", false);
+    maxOutlinkDepthNumLinks = conf.getInt(
+        "fetcher.follow.outlinks.num.links", 4);
+    outlinksDepthDivisor = conf.getInt(
+        "fetcher.follow.outlinks.depth.divisor", 2);
+  }
+
+  /**
+   * Main fetch loop. Repeatedly takes a FetchItem from the shared queues,
+   * checks robots.txt, fetches the page through the protocol plugin and
+   * emits the result via output(), following redirects up to
+   * http.redirect.max times within the inner do/while. The thread exits when
+   * it is halted, or when the feeder is done and the queues are drained.
+   * The EXCEPTION case deliberately falls through to RETRY (hence the
+   * suppression below).
+   */
+  @SuppressWarnings("fallthrough")
+  public void run() {
+    activeThreads.incrementAndGet(); // count threads
+
+    FetchItem fit = null;
+    try {
+      // checking for the server to be running and fetcher.parse to be true
+      if (parsing && NutchServer.getInstance().isRunning())
+        reportToNutchServer = true;
+      
+      while (true) {
+        // creating FetchNode for storing in FetchNodeDb
+        if (reportToNutchServer)
+          this.fetchNode = new FetchNode();
+        else
+          this.fetchNode = null;
+
+        // check whether must be stopped
+        if (isHalted()) {
+          LOG.debug(getName() + " set to halted");
+          fit = null;
+          return;
+        }
+
+        fit = ((FetchItemQueues) fetchQueues).getFetchItem();
+        if (fit == null) {
+          if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
+            LOG.debug(getName() + " spin-waiting ...");
+            // spin-wait: queues are temporarily empty but more work may come
+            ((AtomicInteger) spinWaiting).incrementAndGet();
+            try {
+              Thread.sleep(500);
+            } catch (Exception e) {
+            }
+            ((AtomicInteger) spinWaiting).decrementAndGet();
+            continue;
+          } else {
+            // all done, finish this thread
+            LOG.info("Thread " + getName() + " has no more work available");
+            return;
+          }
+        }
+        lastRequestStart.set(System.currentTimeMillis());
+        // use the representative URL stored in the datum metadata, if any
+        Text reprUrlWritable = (Text) fit.datum.getMetaData().get(
+            Nutch.WRITABLE_REPR_URL_KEY);
+        if (reprUrlWritable == null) {
+          setReprUrl(fit.url.toString());
+        } else {
+          setReprUrl(reprUrlWritable.toString());
+        }
+        try {
+          // fetch the page
+          redirecting = false;
+          redirectCount = 0;
+          do {
+            if (LOG.isInfoEnabled()) {
+              LOG.info("fetching " + fit.url + " (queue crawl delay="
+                  + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay
+                  + "ms)");
+            }
+            if (LOG.isDebugEnabled()) {
+              LOG.debug("redirectCount=" + redirectCount);
+            }
+            redirecting = false;
+            Protocol protocol = this.protocolFactory.getProtocol(fit.url
+                .toString());
+            BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
+            if (!rules.isAllowed(fit.u.toString())) {
+              // unblock
+              ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
+              if (LOG.isDebugEnabled()) {
+                LOG.debug("Denied by robots.txt: " + fit.url);
+              }
+              output(fit.url, fit.datum, null,
+                  ProtocolStatus.STATUS_ROBOTS_DENIED,
+                  CrawlDatum.STATUS_FETCH_GONE);
+              reporter.incrCounter("FetcherStatus", "robots_denied", 1);
+              continue;
+            }
+            if (rules.getCrawlDelay() > 0) {
+              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
+                // unblock
+                ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
+                LOG.debug("Crawl-Delay for " + fit.url + " too long ("
+                    + rules.getCrawlDelay() + "), skipping");
+                output(fit.url, fit.datum, null,
+                    ProtocolStatus.STATUS_ROBOTS_DENIED,
+                    CrawlDatum.STATUS_FETCH_GONE);
+                reporter.incrCounter("FetcherStatus",
+                    "robots_denied_maxcrawldelay", 1);
+                continue;
+              } else {
+                // honor the per-site Crawl-Delay from robots.txt
+                FetchItemQueue fiq = ((FetchItemQueues) fetchQueues)
+                    .getFetchItemQueue(fit.queueID);
+                fiq.crawlDelay = rules.getCrawlDelay();
+                if (LOG.isDebugEnabled()) {
+                  LOG.debug("Crawl delay for queue: " + fit.queueID
+                      + " is set to " + fiq.crawlDelay
+                      + " as per robots.txt. url: " + fit.url);
+                }
+              }
+            }
+            ProtocolOutput output = protocol.getProtocolOutput(fit.url,
+                fit.datum);
+            ProtocolStatus status = output.getStatus();
+            Content content = output.getContent();
+            ParseStatus pstatus = null;
+            // unblock queue
+            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
+
+            String urlString = fit.url.toString();
+            
+            // used for FetchNode
+            if (fetchNode != null) {
+              fetchNode.setStatus(status.getCode());
+              fetchNode.setFetchTime(System.currentTimeMillis());
+              fetchNode.setUrl(fit.url);
+            }
+
+            reporter.incrCounter("FetcherStatus", status.getName(), 1);
+
+            switch (status.getCode()) {
+
+            case ProtocolStatus.WOULDBLOCK:
+              // retry ?
+              ((FetchItemQueues) fetchQueues).addFetchItem(fit);
+              break;
+
+            case ProtocolStatus.SUCCESS: // got a page
+              pstatus = output(fit.url, fit.datum, content, status,
+                  CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
+              updateStatus(content.getContent().length);
+              // a successful parse may still signal a meta-refresh redirect
+              if (pstatus != null && pstatus.isSuccess()
+                  && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+                String newUrl = pstatus.getMessage();
+                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+                Text redirUrl = handleRedirect(fit.url, fit.datum, urlString,
+                    newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME,
+                    Fetcher.CONTENT_REDIR);
+                if (redirUrl != null) {
+                  fit = queueRedirect(redirUrl, fit);
+                }
+              }
+              break;
+
+            case ProtocolStatus.MOVED: // redirect
+            case ProtocolStatus.TEMP_MOVED:
+              int code;
+              boolean temp;
+              if (status.getCode() == ProtocolStatus.MOVED) {
+                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
+                temp = false;
+              } else {
+                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
+                temp = true;
+              }
+              output(fit.url, fit.datum, content, status, code);
+              String newUrl = status.getMessage();
+              Text redirUrl = handleRedirect(fit.url, fit.datum, urlString,
+                  newUrl, temp, Fetcher.PROTOCOL_REDIR);
+              if (redirUrl != null) {
+                fit = queueRedirect(redirUrl, fit);
+              } else {
+                // stop redirecting
+                redirecting = false;
+              }
+              break;
+
+            case ProtocolStatus.EXCEPTION:
+              logError(fit.url, status.getMessage());
+              int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit
+                  .getQueueID());
+              if (killedURLs != 0)
+                reporter.incrCounter("FetcherStatus",
+                    "AboveExceptionThresholdInQueue", killedURLs);
+              /* FALLTHROUGH */
+            case ProtocolStatus.RETRY: // retry
+            case ProtocolStatus.BLOCKED:
+              output(fit.url, fit.datum, null, status,
+                  CrawlDatum.STATUS_FETCH_RETRY);
+              break;
+
+            case ProtocolStatus.GONE: // gone
+            case ProtocolStatus.NOTFOUND:
+            case ProtocolStatus.ACCESS_DENIED:
+            case ProtocolStatus.ROBOTS_DENIED:
+              output(fit.url, fit.datum, null, status,
+                  CrawlDatum.STATUS_FETCH_GONE);
+              break;
+
+            case ProtocolStatus.NOTMODIFIED:
+              output(fit.url, fit.datum, null, status,
+                  CrawlDatum.STATUS_FETCH_NOTMODIFIED);
+              break;
+
+            default:
+              if (LOG.isWarnEnabled()) {
+                LOG.warn("Unknown ProtocolStatus: " + status.getCode());
+              }
+              output(fit.url, fit.datum, null, status,
+                  CrawlDatum.STATUS_FETCH_RETRY);
+            }
+
+            if (redirecting && redirectCount > maxRedirect) {
+              ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
+              if (LOG.isInfoEnabled()) {
+                LOG.info(" - redirect count exceeded " + fit.url);
+              }
+              output(fit.url, fit.datum, null,
+                  ProtocolStatus.STATUS_REDIR_EXCEEDED,
+                  CrawlDatum.STATUS_FETCH_GONE);
+            }
+
+          } while (redirecting && (redirectCount <= maxRedirect));
+
+        } catch (Throwable t) { // unexpected exception
+          // unblock
+          ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
+          logError(fit.url, StringUtils.stringifyException(t));
+          output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
+              CrawlDatum.STATUS_FETCH_RETRY);
+        }
+      }
+
+    } catch (Throwable e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("fetcher caught:" + e.toString());
+      }
+    } finally {
+      // always release the in-progress item and decrement the thread count,
+      // even on abnormal termination
+      if (fit != null)
+        ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
+      activeThreads.decrementAndGet(); // count threads
+      LOG.info("-finishing thread " + getName() + ", activeThreads="
+          + activeThreads);
+    }
+  }
+
+  /**
+   * Normalizes and filters a redirect target and decides how to handle it.
+   * Side effects: may update {@code reprUrl}, {@code redirecting} and
+   * {@code redirectCount}.
+   *
+   * @param temp true for a temporary redirect (affects reprUrl choice)
+   * @param redirType Fetcher.PROTOCOL_REDIR or Fetcher.CONTENT_REDIR, used
+   *        only in log messages
+   * @return the redirect target as Text when it should be fetched now
+   *         (maxRedirect &gt; 0 and the URL survived normalization,
+   *         filtering and the internal/external-link policy); null when the
+   *         redirect is skipped or only recorded as a linked CrawlDatum for
+   *         a later fetch
+   */
+  private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
+      String newUrl, boolean temp, String redirType)
+      throws MalformedURLException, URLFilterException {
+    newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
+    newUrl = urlFilters.filter(newUrl);
+
+    try {
+      String origHost = new URL(urlString).getHost().toLowerCase();
+      String newHost = new URL(newUrl).getHost().toLowerCase();
+      if (ignoreExternalLinks) {
+        if (!origHost.equals(newHost)) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(" - ignoring redirect " + redirType + " from "
+                + urlString + " to " + newUrl
+                + " because external links are ignored");
+          }
+          return null;
+        }
+      }
+      
+      if (ignoreInternalLinks) {
+        if (origHost.equals(newHost)) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug(" - ignoring redirect " + redirType + " from "
+                + urlString + " to " + newUrl
+                + " because internal links are ignored");
+          }
+          return null;
+        }
+      }
+    } catch (MalformedURLException e) { }
+    // NOTE(review): a MalformedURLException here (including newUrl == null
+    // after filtering) is deliberately swallowed so the null/same-url check
+    // below produces the final decision.
+    
+    if (newUrl != null && !newUrl.equals(urlString)) {
+      reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
+      url = new Text(newUrl);
+      if (maxRedirect > 0) {
+        redirecting = true;
+        redirectCount++;
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(" - " + redirType + " redirect to " + url
+              + " (fetching now)");
+        }
+        return url;
+      } else {
+        // redirect-following disabled: record the target as a linked datum
+        // so the crawldb can pick it up in a later generate/fetch cycle
+        CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
+            datum.getFetchInterval(), datum.getScore());
+        // transfer existing metadata
+        newDatum.getMetaData().putAll(datum.getMetaData());
+        try {
+          scfilters.initialScore(url, newDatum);
+        } catch (ScoringFilterException e) {
+          e.printStackTrace();
+        }
+        if (reprUrl != null) {
+          newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+              new Text(reprUrl));
+        }
+        output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
+        if (LOG.isDebugEnabled()) {
+          LOG.debug(" - " + redirType + " redirect to " + url
+              + " (fetching later)");
+        }
+        return null;
+      }
+    } else {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug(" - " + redirType + " redirect skipped: "
+            + (newUrl != null ? "to same url" : "filtered"));
+      }
+      return null;
+    }
+  }
+
+  /**
+   * Turns a redirect target into a new in-progress FetchItem, carrying over
+   * the original item's metadata, fetch interval and score.
+   *
+   * @return the new FetchItem to fetch next, or null when no item could be
+   *         created for the URL (in that case the redirecting flag is
+   *         cleared and a counter is incremented)
+   */
+  private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
+      throws ScoringFilterException {
+    CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
+        fit.datum.getFetchInterval(), fit.datum.getScore());
+    // transfer all existing metadata to the redirect
+    newDatum.getMetaData().putAll(fit.datum.getMetaData());
+    scfilters.initialScore(redirUrl, newDatum);
+    if (reprUrl != null) {
+      newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+          new Text(reprUrl));
+    }
+    fit = FetchItem.create(redirUrl, newDatum, queueMode);
+    if (fit != null) {
+      // mark the redirect as in progress on its queue so politeness holds
+      FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
+      fiq.addInProgressFetchItem(fit);
+    } else {
+      // stop redirecting
+      redirecting = false;
+      reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect",
+          1);
+    }
+    return fit;
+  }
+
+  /** Logs a failed fetch at INFO level and bumps the shared error counter. */
+  private void logError(Text url, String message) {
+    if (LOG.isInfoEnabled()) {
+      LOG.info("fetch of " + url + " failed with: " + message);
+    }
+    errors.incrementAndGet();
+  }
+
+  /**
+   * Convenience overload of
+   * {@link #output(Text, CrawlDatum, Content, ProtocolStatus, int, int)}
+   * with an outlink depth of 0.
+   */
+  private ParseStatus output(Text key, CrawlDatum datum, Content content,
+      ProtocolStatus pstatus, int status) {
+
+    return output(key, datum, content, pstatus, status, 0);
+  }
+
+  /**
+   * Writes the result of fetching one URL: updates the datum's status and
+   * fetch time, optionally parses the content inline, computes signatures,
+   * normalizes/filters/deduplicates outlinks (optionally enqueueing them up
+   * to fetcher.follow.outlinks.depth), and collects datum, content and parse
+   * records to the job output.
+   *
+   * @param outlinkDepth depth of this URL in the followed-outlink chain
+   * @return the ParseStatus of the parse matching the fetched URL, or null
+   *         when nothing was parsed
+   */
+  private ParseStatus output(Text key, CrawlDatum datum, Content content,
+      ProtocolStatus pstatus, int status, int outlinkDepth) {
+
+    datum.setStatus(status);
+    datum.setFetchTime(System.currentTimeMillis());
+    if (pstatus != null)
+      datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
+
+    ParseResult parseResult = null;
+    if (content != null) {
+      Metadata metadata = content.getMetadata();
+
+      // store the guessed content type in the crawldatum
+      if (content.getContentType() != null)
+        datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
+            new Text(content.getContentType()));
+
+      // add segment to metadata
+      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
+      // add score to content metadata so that ParseSegment can pick it up.
+      try {
+        scfilters.passScoreBeforeParsing(key, datum, content);
+      } catch (Exception e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+        }
+      }
+      /*
+       * Note: Fetcher will only follow meta-redirects coming from the
+       * original URL.
+       */
+      if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+        if (!skipTruncated
+            || (skipTruncated && !ParseSegment.isTruncated(content))) {
+          try {
+            parseResult = this.parseUtil.parse(content);
+          } catch (Exception e) {
+            LOG.warn("Error parsing: " + key + ": "
+                + StringUtils.stringifyException(e));
+          }
+        }
+
+        // no parse (skipped or failed): sign the raw content with an empty
+        // parse so dedup still works
+        if (parseResult == null) {
+          byte[] signature = SignatureFactory.getSignature(conf)
+              .calculate(content, new ParseStatus().getEmptyParse(conf));
+          datum.setSignature(signature);
+        }
+      }
+
+      /*
+       * Store status code in content So we can read this value during parsing
+       * (as a separate job) and decide to parse or not.
+       */
+      content.getMetadata().add(Nutch.FETCH_STATUS_KEY,
+          Integer.toString(status));
+    }
+
+    try {
+      output.collect(key, new NutchWritable(datum));
+      if (content != null && storingContent)
+        output.collect(key, new NutchWritable(content));
+      if (parseResult != null) {
+        for (Entry<Text, Parse> entry : parseResult) {
+          Text url = entry.getKey();
+          Parse parse = entry.getValue();
+          ParseStatus parseStatus = parse.getData().getStatus();
+          ParseData parseData = parse.getData();
+
+          if (!parseStatus.isSuccess()) {
+            LOG.warn("Error parsing: " + key + ": " + parseStatus);
+            parse = parseStatus.getEmptyParse(conf);
+          }
+
+          // Calculate page signature. For non-parsing fetchers this will
+          // be done in ParseSegment
+          byte[] signature = SignatureFactory.getSignature(conf)
+              .calculate(content, parse);
+          // Ensure segment name and score are in parseData metadata
+          parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
+          parseData.getContentMeta().set(Nutch.SIGNATURE_KEY,
+              StringUtil.toHexString(signature));
+          // Pass fetch time to content meta
+          parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY,
+              Long.toString(datum.getFetchTime()));
+          if (url.equals(key))
+            datum.setSignature(signature);
+          try {
+            scfilters.passScoreAfterParsing(url, content, parse);
+          } catch (Exception e) {
+            if (LOG.isWarnEnabled()) {
+              LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+            }
+          }
+
+          String origin = null;
+
+          // collect outlinks for subsequent db update
+          Outlink[] links = parseData.getOutlinks();
+          int outlinksToStore = Math.min(maxOutlinks, links.length);
+          if (ignoreExternalLinks || ignoreInternalLinks) {
+            URL originURL = new URL(url.toString());
+            // based on domain?
+            if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+              origin = URLUtil.getDomainName(originURL).toLowerCase();
+            } 
+            // use host 
+            else {
+              origin = originURL.getHost().toLowerCase();
+            }
+          }
+          
+          //used by fetchNode         
+          if(fetchNode!=null){
+            fetchNode.setOutlinks(links);
+            fetchNode.setTitle(parseData.getTitle());
+            FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
+          }
+          int validCount = 0;
+
+          // Process all outlinks, normalize, filter and deduplicate
+          List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
+          HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
+          for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
+            String toUrl = links[i].getToUrl();
+
+            toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
+                origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode,
+                    urlFilters, urlExemptionFilters,  normalizers);
+            if (toUrl == null) {
+              continue;
+            }
+
+            validCount++;
+            links[i].setUrl(toUrl);
+            outlinkList.add(links[i]);
+            outlinks.add(toUrl);
+          }
+
+          // Only process depth N outlinks
+          if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
+            reporter.incrCounter("FetcherOutlinks", "outlinks_detected",
+                outlinks.size());
+
+            // Counter to limit num outlinks to follow per page
+            int outlinkCounter = 0;
+
+            // Calculate variable number of outlinks by depth using the
+            // divisor (outlinks = Math.floor(divisor / depth * num.links))
+            int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor
+                / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
+
+            String followUrl;
+
+            // Walk over the outlinks and add as new FetchItem to the queues
+            Iterator<String> iter = outlinks.iterator();
+            while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
+              followUrl = iter.next();
+
+              // Check whether we'll follow external outlinks
+              if (outlinksIgnoreExternal) {
+                if (!URLUtil.getHost(url.toString()).equals(
+                    URLUtil.getHost(followUrl))) {
+                  continue;
+                }
+              }
+
+              reporter
+                  .incrCounter("FetcherOutlinks", "outlinks_following", 1);
+
+              // Create new FetchItem with depth incremented
+              FetchItem fit = FetchItem.create(new Text(followUrl),
+                  new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
+                  queueMode, outlinkDepth + 1);
+              ((FetchItemQueues) fetchQueues).addFetchItem(fit);
+
+              outlinkCounter++;
+            }
+          }
+
+          // Overwrite the outlinks in ParseData with the normalized and
+          // filtered set
+          parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList
+              .size()]));
+
+          output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
+              parse.getText()), parseData, parse.isCanonical())));
+        }
+      }
+    } catch (IOException e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error("fetcher caught:" + e.toString());
+      }
+    }
+
+    // return parse status if it exits
+    if (parseResult != null && !parseResult.isEmpty()) {
+      Parse p = parseResult.get(content.getUrl());
+      if (p != null) {
+        reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[p
+            .getData().getStatus().getMajorCode()], 1);
+        return p.getData().getStatus();
+      }
+    }
+    return null;
+  }
+  
+  /**
+   * Accounts one fetched page and its byte count in the job-wide counters.
+   * (The declared IOException is never thrown here.)
+   */
+  private void updateStatus(int bytesInPage) throws IOException {
+    pages.incrementAndGet();
+    bytes.addAndGet(bytesInPage);
+  }
+
+  /** Asks this thread to stop at the next iteration of its fetch loop. */
+  public synchronized void setHalted(boolean halted) {
+    this.halted = halted;
+  }
+
+  /** @return true when this thread has been asked to stop */
+  public synchronized boolean isHalted() {
+    return halted;
+  }
+
+  /** @return the representative URL of the fetch item currently handled */
+  public String getReprUrl() {
+    return reprUrl;
+  }
+  
+  /** Records the representative URL for the fetch item currently handled. */
+  private void setReprUrl(String urlString) {
+    reprUrl = urlString;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/QueueFeeder.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/QueueFeeder.java b/nutch-core/src/main/java/org/apache/nutch/fetcher/QueueFeeder.java
new file mode 100644
index 0000000..79652e7
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/QueueFeeder.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class feeds the queues with input items, and re-fills them as items
+ * are consumed by FetcherThread-s.
+ */
+public class QueueFeeder extends Thread {
+  
+  private static final Logger LOG = LoggerFactory.getLogger(QueueFeeder.class);
+
+  
+  /** Source of <url, datum> input records. */
+  private RecordReader<Text, CrawlDatum> reader;
+  /** Shared queues that FetcherThreads consume from. */
+  private FetchItemQueues queues;
+  /** Upper bound on the total number of items kept queued at any time. */
+  private int size;
+  /** Absolute wall-clock deadline in ms since epoch; -1 means no time limit. */
+  private long timelimit = -1;
+
+  /**
+   * @param reader source of input records
+   * @param queues shared fetch queues to fill
+   * @param size   maximum total number of queued items to maintain
+   */
+  public QueueFeeder(RecordReader<Text, CrawlDatum> reader,
+      FetchItemQueues queues, int size) {
+    this.reader = reader;
+    this.queues = queues;
+    this.size = size;
+    // Daemon thread: must not keep the JVM alive if the job is torn down.
+    this.setDaemon(true);
+    this.setName("QueueFeeder");
+  }
+
+  /** Sets the absolute deadline (ms since epoch) after which input is drained unprocessed. */
+  public void setTimeLimit(long tl) {
+    timelimit = tl;
+  }
+
+  /**
+   * Feeds records into the queues until the input is exhausted, keeping at most
+   * {@code size} items queued. Once the time limit has passed, remaining input
+   * records are read and counted but not queued.
+   */
+  public void run() {
+    boolean hasMore = true;
+    int cnt = 0;
+    int timelimitcount = 0;
+    while (hasMore) {
+      if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
+        // time limit reached: simply
+        // read all the entries from the input without processing them
+        try {
+          Text url = new Text();
+          CrawlDatum datum = new CrawlDatum();
+          hasMore = reader.next(url, datum);
+          timelimitcount++;
+        } catch (IOException e) {
+          LOG.error("QueueFeeder error reading input, record " + cnt, e);
+          return;
+        }
+        continue;
+      }
+      int feed = size - queues.getTotalSize();
+      if (feed <= 0) {
+        // queues are full - spin-wait until they have some free space
+        // NOTE(review): InterruptedException is swallowed here, so interruption
+        // will not stop the feeder; also note the stray empty statement below.
+        try {
+          Thread.sleep(1000);
+        } catch (Exception e) {
+        }
+        ;
+        continue;
+      } else {
+        LOG.debug("-feeding " + feed + " input urls ...");
+        while (feed > 0 && hasMore) {
+          try {
+            Text url = new Text();
+            CrawlDatum datum = new CrawlDatum();
+            hasMore = reader.next(url, datum);
+            if (hasMore) {
+              queues.addFetchItem(url, datum);
+              cnt++;
+              feed--;
+            }
+          } catch (IOException e) {
+            LOG.error("QueueFeeder error reading input, record " + cnt, e);
+            return;
+          }
+        }
+      }
+    }
+    LOG.info("QueueFeeder finished: total " + cnt
+        + " records + hit by time limit :" + timelimitcount);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/fetcher/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/fetcher/package.html b/nutch-core/src/main/java/org/apache/nutch/fetcher/package.html
new file mode 100644
index 0000000..9c843e0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/fetcher/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+The Nutch robot.
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/hostdb/HostDatum.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/hostdb/HostDatum.java b/nutch-core/src/main/java/org/apache/nutch/hostdb/HostDatum.java
new file mode 100644
index 0000000..424fb1e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/hostdb/HostDatum.java
@@ -0,0 +1,324 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.hostdb;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Date;
+import java.util.Map.Entry;
+import java.text.SimpleDateFormat;
+
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Writable record of per-host state kept in the HostDb: per-status record
+ * counts, DNS/connection failure counters, score, homepage URL, last-check
+ * date and free-form metadata.
+ */
+public class HostDatum implements Writable, Cloneable {
+  // NOTE(review): appears unused within this class; numFailures() is derived
+  // from dnsFailures + connectionFailures instead.
+  protected int failures = 0;
+  protected float score = 0;
+  // Epoch 0 means "never checked"; see isEmpty().
+  protected Date lastCheck = new Date(0);
+  protected String homepageUrl = new String();
+
+  protected MapWritable metaData = new MapWritable();
+
+  // Records the number of times DNS look-up failed, may indicate host no longer exists
+  protected int dnsFailures = 0;
+
+  // Records the number of connection failures, may indicate our network being blocked by firewall
+  protected int connectionFailures = 0;
+
+  // Per-CrawlDatum-status record counts for URLs belonging to this host.
+  protected int unfetched = 0;
+  protected int fetched = 0;
+  protected int notModified = 0;
+  protected int redirTemp = 0;
+  protected int redirPerm = 0;
+  protected int gone = 0;
+
+  public HostDatum() {
+  }
+
+  public HostDatum(float score) {
+    this(score, new Date());
+  }
+
+  public HostDatum(float score, Date lastCheck) {
+    this(score, lastCheck, new String());
+  }
+
+  public HostDatum(float score, Date lastCheck, String homepageUrl) {
+    this.score =  score;
+    this.lastCheck = lastCheck;
+    this.homepageUrl = homepageUrl;
+  }
+
+  /** Resets both DNS and connection failure counters to zero. */
+  public void resetFailures() {
+    setDnsFailures(0);
+    setConnectionFailures(0);
+  }
+
+  public void setDnsFailures(Integer dnsFailures) {
+    this.dnsFailures = dnsFailures;
+  }
+
+  public void setConnectionFailures(Integer connectionFailures) {
+    this.connectionFailures = connectionFailures;
+  }
+
+  public void incDnsFailures() {
+    this.dnsFailures++;
+  }
+
+  public void incConnectionFailures() {
+    this.connectionFailures++;
+  }
+
+  /** @return total failures: DNS failures plus connection failures. */
+  public Integer numFailures() {
+    return getDnsFailures() + getConnectionFailures();
+  }
+
+  public Integer getDnsFailures() {
+    return dnsFailures;
+  }
+
+  public Integer getConnectionFailures() {
+    return connectionFailures;
+  }
+
+  public void setScore(float score) {
+    this.score = score;
+  }
+
+  /** Stamps the last-check date with the current time. */
+  public void setLastCheck() {
+    setLastCheck(new Date());
+  }
+
+  public void setLastCheck(Date date) {
+    lastCheck = date;
+  }
+
+  /** @return true if this datum was never checked (lastCheck is still epoch 0). */
+  public boolean isEmpty() {
+    return (lastCheck.getTime() == 0) ? true : false;
+  }
+
+  public float getScore() {
+    return score;
+  }
+
+  /** @return total number of records across all status buckets. */
+  public Integer numRecords() {
+    return unfetched + fetched + gone + redirPerm + redirTemp + notModified;
+  }
+
+  public Date getLastCheck() {
+    return lastCheck;
+  }
+
+  public boolean hasHomepageUrl() {
+    return homepageUrl.length() > 0;
+  }
+
+  public String getHomepageUrl() {
+    return homepageUrl;
+  }
+
+  public void setHomepageUrl(String homepageUrl) {
+    this.homepageUrl = homepageUrl;
+  }
+
+  public void setUnfetched(int val) {
+    unfetched = val;
+  }
+
+  public int getUnfetched() {
+    return unfetched;
+  }
+
+  public void setFetched(int val) {
+    fetched = val;
+  }
+
+  public int getFetched() {
+    return fetched;
+  }
+
+  public void setNotModified(int val) {
+    notModified = val;
+  }
+
+  public int getNotModified() {
+    return notModified;
+  }
+
+  public void setRedirTemp(int val) {
+    redirTemp = val;
+  }
+
+  public int getRedirTemp() {
+    return redirTemp;
+  }
+
+  public void setRedirPerm(int val) {
+    redirPerm = val;
+  }
+
+  public int getRedirPerm() {
+    return redirPerm;
+  }
+
+  public void setGone(int val) {
+    gone = val;
+  }
+
+  public int getGone() {
+    return gone;
+  }
+
+  /** Zeroes all per-status record counters (failure counters are untouched). */
+  public void resetStatistics() {
+    setUnfetched(0);
+    setFetched(0);
+    setGone(0);
+    setRedirTemp(0);
+    setRedirPerm(0);
+    setNotModified(0);
+  }
+
+   public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
+     this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
+   }
+
+   /**
+    * Add all metadata from other CrawlDatum to this CrawlDatum.
+    *
+    * @param other HostDatum
+    */
+   public void putAllMetaData(HostDatum other) {
+     for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
+       getMetaData().put(e.getKey(), e.getValue());
+     }
+   }
+
+  /**
+   * returns a MapWritable if it was set or read in @see readFields(DataInput),
+   * returns empty map in case CrawlDatum was freshly created (lazily instantiated).
+   */
+  public org.apache.hadoop.io.MapWritable getMetaData() {
+    if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
+    return this.metaData;
+  }
+
+  @Override
+  public Object clone() throws CloneNotSupportedException {
+    HostDatum result = (HostDatum)super.clone();
+    result.score = score;
+    result.lastCheck = lastCheck;
+    result.homepageUrl = homepageUrl;
+
+    result.dnsFailures = dnsFailures;
+    result.connectionFailures = connectionFailures;
+
+    result.unfetched = unfetched;
+    result.fetched = fetched;
+    result.notModified = notModified;
+    result.redirTemp = redirTemp;
+    result.redirPerm = redirPerm;
+    result.gone = gone;
+
+    // NOTE(review): metaData is assigned by reference, not copied — the clone
+    // shares the same MapWritable, so metadata changes are visible to both.
+    result.metaData = metaData;
+
+    return result;
+  }
+
+  /** Deserializes all fields; order must stay in sync with {@link #write(DataOutput)}. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    score = in.readFloat();
+    lastCheck = new Date(in.readLong());
+    homepageUrl = Text.readString(in);
+
+    dnsFailures = in.readInt();
+    connectionFailures = in.readInt();
+
+    unfetched= in.readInt();
+    fetched= in.readInt();
+    notModified= in.readInt();
+    redirTemp= in.readInt();
+    redirPerm = in.readInt();
+    gone = in.readInt();
+
+    metaData = new org.apache.hadoop.io.MapWritable();
+    metaData.readFields(in);
+  }
+
+  /** Serializes all fields; order must stay in sync with {@link #readFields(DataInput)}. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeFloat(score);
+    out.writeLong(lastCheck.getTime());
+    Text.writeString(out, homepageUrl);
+
+    out.writeInt(dnsFailures);
+    out.writeInt(connectionFailures);
+
+    out.writeInt(unfetched);
+    out.writeInt(fetched);
+    out.writeInt(notModified);
+    out.writeInt(redirTemp);
+    out.writeInt(redirPerm);
+    out.writeInt(gone);
+
+    metaData.write(out);
+  }
+
+  /** Tab-separated dump of all counters, score, date, homepage and metadata. */
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder();
+    buf.append(Integer.toString(getUnfetched()));
+    buf.append("\t");
+    buf.append(Integer.toString(getFetched()));
+    buf.append("\t");
+    buf.append(Integer.toString(getGone()));
+    buf.append("\t");
+    buf.append(Integer.toString(getRedirTemp()));
+    buf.append("\t");
+    buf.append(Integer.toString(getRedirPerm()));
+    buf.append("\t");
+    buf.append(Integer.toString(getNotModified()));
+    buf.append("\t");
+    buf.append(Integer.toString(numRecords()));
+    buf.append("\t");
+    buf.append(Integer.toString(getDnsFailures()));
+    buf.append("\t");
+    buf.append(Integer.toString(getConnectionFailures()));
+    buf.append("\t");
+    buf.append(Integer.toString(numFailures()));
+    buf.append("\t");
+    buf.append(Float.toString(score));
+    buf.append("\t");
+    buf.append(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(lastCheck));
+    buf.append("\t");
+    buf.append(homepageUrl);
+    buf.append("\t");
+    for (Entry<Writable, Writable> e : getMetaData().entrySet()) {
+      buf.append(e.getKey().toString());
+      buf.append(':');
+      buf.append(e.getValue().toString());
+      buf.append("|||");
+    }
+    return buf.toString();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/hostdb/ReadHostDb.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/hostdb/ReadHostDb.java b/nutch-core/src/main/java/org/apache/nutch/hostdb/ReadHostDb.java
new file mode 100644
index 0000000..240e109
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.hostdb;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.jexl2.MapContext;
+
+/**
+ * Dumps or filters the contents of the HostDb. Filter expressions use Apache
+ * Commons JEXL syntax; see
+ * <a href="http://commons.apache.org/proper/commons-jexl/reference/syntax.html">the JEXL syntax reference</a>.
+ */
+public class ReadHostDb extends Configured implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ReadHostDb.class);
+
+  public static final String HOSTDB_DUMP_HOSTNAMES = "hostdb.dump.hostnames";
+  public static final String HOSTDB_DUMP_HOMEPAGES = "hostdb.dump.homepages";
+  public static final String HOSTDB_FILTER_EXPRESSION = "hostdb.filter.expression";
+
+  /** Map-only pass over the HostDb: optionally JEXL-filters records, then emits
+   *  homepage URLs, hostnames, or the full datum depending on configuration. */
+  static class ReadHostDbMapper extends Mapper<Text, HostDatum, Text, Text> {
+    protected boolean dumpHostnames = false;
+    protected boolean dumpHomepages = false;
+    protected Text emptyText = new Text();
+    /** Compiled JEXL filter, or null when no expression is configured. */
+    protected Expression expr = null;
+
+    public void setup(Context context) {
+      dumpHomepages = context.getConfiguration().getBoolean(HOSTDB_DUMP_HOMEPAGES, false);
+      dumpHostnames = context.getConfiguration().getBoolean(HOSTDB_DUMP_HOSTNAMES, false);
+      String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION);
+      if (expr != null) {
+        // Create or retrieve a JexlEngine
+        JexlEngine jexl = new JexlEngine();
+        
+        // silent(true): evaluation errors are not thrown; strict(true): undefined
+        // variables / invalid operands are treated as errors
+        jexl.setSilent(true);
+        jexl.setStrict(true);
+        
+        // Create an expression object
+        this.expr = jexl.createExpression(expr);
+      }
+    }
+
+    /** Emits one output line per HostDatum that passes the (optional) filter. */
+    public void map(Text key, HostDatum datum, Context context) throws IOException, InterruptedException {     
+      if (expr != null) {
+        // Create a context and add data
+        JexlContext jcontext = new MapContext();
+        
+        // Set some fixed variables
+        jcontext.set("unfetched", datum.getUnfetched());
+        jcontext.set("fetched", datum.getFetched());
+        jcontext.set("gone", datum.getGone());
+        jcontext.set("redirTemp", datum.getRedirTemp());
+        jcontext.set("redirPerm", datum.getRedirPerm());
+        jcontext.set("redirs", datum.getRedirPerm() + datum.getRedirTemp());
+        jcontext.set("notModified", datum.getNotModified());
+        jcontext.set("ok", datum.getFetched() + datum.getNotModified());
+        jcontext.set("numRecords", datum.numRecords());
+        jcontext.set("dnsFailures", datum.getDnsFailures());
+        jcontext.set("connectionFailures", datum.getConnectionFailures());
+        
+        // Set metadata variables (only numeric Float/Int values are exposed)
+        for (Map.Entry<Writable, Writable> entry : datum.getMetaData().entrySet()) {
+          Object value = entry.getValue();
+          
+          if (value instanceof FloatWritable) {
+            FloatWritable fvalue = (FloatWritable)value;
+            Text tkey = (Text)entry.getKey();
+            jcontext.set(tkey.toString(), fvalue.get());
+          }
+          
+          if (value instanceof IntWritable) {
+            IntWritable ivalue = (IntWritable)value;
+            Text tkey = (Text)entry.getKey();
+            jcontext.set(tkey.toString(), ivalue.get());
+          }
+        }
+        
+        // Filter this record if evaluation did not pass
+        try {
+          if (!Boolean.TRUE.equals(expr.evaluate(jcontext))) {
+            return;
+          }
+        } catch (Exception e) {
+          // evaluation failure: log and fall through, i.e. the record is kept
+          LOG.info(e.toString() + " for " + key.toString());
+        }
+      }
+      
+      if (dumpHomepages) {
+        if (datum.hasHomepageUrl()) {
+          context.write(new Text(datum.getHomepageUrl()), emptyText);
+        }
+        return;
+      }
+      
+      if (dumpHostnames) {
+        context.write(key, emptyText);
+        return;
+      }
+      
+      // Write anyway
+      context.write(key, new Text(datum.toString()));
+    }
+  }
+
+  // Todo, reduce unknown hosts to single unknown domain if possible. Enable via configuration
+  // host_a.example.org,host_a.example.org ==> example.org
+//   static class ReadHostDbReducer extends Reduce<Text, Text, Text, Text> {
+//     public void setup(Context context) { }
+//
+//     public void reduce(Text domain, Iterable<Text> hosts, Context context) throws IOException, InterruptedException {
+//
+//     }
+//   }
+
+  /**
+   * Configures and runs the map-only dump job over {@code <hostDb>/current}.
+   *
+   * @param hostDb        HostDb directory (the "current" segment is read)
+   * @param output        destination directory for the text output
+   * @param dumpHomepages emit homepage URLs only
+   * @param dumpHostnames emit hostnames only
+   * @param expr          optional JEXL filter expression, may be null
+   */
+  private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean dumpHostnames, String expr) throws Exception {
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("ReadHostDb: starting at " + sdf.format(start));
+
+    Configuration conf = getConf();
+    conf.setBoolean(HOSTDB_DUMP_HOMEPAGES, dumpHomepages);
+    conf.setBoolean(HOSTDB_DUMP_HOSTNAMES, dumpHostnames);
+    if (expr != null) {
+      conf.set(HOSTDB_FILTER_EXPRESSION, expr);
+    }
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+    conf.set("mapred.textoutputformat.separator", "\t");
+    
+    Job job = new Job(conf, "ReadHostDb");
+    job.setJarByClass(ReadHostDb.class);
+
+    FileInputFormat.addInputPath(job, new Path(hostDb, "current"));
+    FileOutputFormat.setOutputPath(job, output);
+
+    // NOTE(review): duplicate call — setJarByClass was already invoked above.
+    job.setJarByClass(ReadHostDb.class);
+    job.setMapperClass(ReadHostDbMapper.class);
+
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setOutputFormatClass(TextOutputFormat.class);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(Text.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Text.class);
+    job.setNumReduceTasks(0);
+
+    // NOTE(review): this catch only rethrows and could be removed.
+    try {
+      job.waitForCompletion(true);
+    } catch (Exception e) {
+      throw e;
+    }
+
+    long end = System.currentTimeMillis();
+    LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+  }
+
+  public static void main(String args[]) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new ReadHostDb(), args);
+    System.exit(res);
+  }
+
+  /** Parses command-line flags and dispatches to {@link #readHostDb}. */
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println("Usage: ReadHostDb <hostdb> <output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]");
+      return -1;
+    }
+
+    boolean dumpHomepages = false;
+    boolean dumpHostnames = false;
+    String expr = null;
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-dumpHomepages")) {
+        LOG.info("ReadHostDb: dumping homepage URL's");
+        dumpHomepages = true;
+      }
+      if (args[i].equals("-dumpHostnames")) {
+        LOG.info("ReadHostDb: dumping hostnames");
+        dumpHostnames = true;
+      }
+      if (args[i].equals("-expr")) {
+        expr = args[i + 1];
+        LOG.info("ReadHostDb: evaluating expression: " + expr);
+        i++;
+      }
+    }
+
+    try {
+      readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("ReadHostDb: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/hostdb/ResolverThread.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/hostdb/ResolverThread.java b/nutch-core/src/main/java/org/apache/nutch/hostdb/ResolverThread.java
new file mode 100644
index 0000000..e7c7978
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/hostdb/ResolverThread.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.hostdb;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Simple runnable that performs DNS lookup for a single host.
+ */
+public class ResolverThread implements Runnable {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ResolverThread.class);
+
+  /** Hostname to resolve. */
+  protected String host = null;
+  /** HostDatum being updated for this host. */
+  protected HostDatum datum = null;
+  /** Reusable Text key holding the hostname. */
+  protected Text hostText = new Text();
+  protected OutputCollector<Text,HostDatum> output;
+  protected Reporter reporter;
+  /** DNS-failure count beyond which a host is purged; -1 disables purging. */
+  protected int purgeFailedHostsThreshold;
+
+  /**
+   * Constructor.
+   */
+  public ResolverThread(String host, HostDatum datum,
+    OutputCollector<Text,HostDatum> output, Reporter reporter, int purgeFailedHostsThreshold) {
+
+    hostText.set(host);
+    this.host = host;
+    this.datum = datum;
+    this.output = output;
+    this.reporter = reporter;
+    this.purgeFailedHostsThreshold = purgeFailedHostsThreshold;
+  }
+
+  /**
+   * Performs the DNS lookup, updates counters/last-check on the datum, and
+   * emits it — unless the host has failed often enough to be purged.
+   */
+  public void run() {
+    // Resolve the host and act appropriately
+    try {
+      // Throws an exception if host is not found
+      InetAddress inetAddr = InetAddress.getByName(host);
+
+      if (datum.isEmpty()) {
+        reporter.incrCounter("UpdateHostDb", "new_known_host" ,1);
+        datum.setLastCheck();
+        LOG.info(host + ": new_known_host " + datum);
+      } else if (datum.getDnsFailures() > 0) {
+        // host was failing before but resolves again now
+        reporter.incrCounter("UpdateHostDb", "rediscovered_host" ,1);
+        datum.setLastCheck();
+        datum.setDnsFailures(0);
+        LOG.info(host + ": rediscovered_host " + datum);
+      } else {
+        reporter.incrCounter("UpdateHostDb", "existing_known_host", 1);
+        datum.setLastCheck();
+        LOG.info(host + ": existing_known_host " + datum);
+      }
+
+      // Write the host datum
+      output.collect(hostText, datum);
+    } catch (UnknownHostException e) {
+      try {
+        // If the counter is empty we'll initialize with date = today and 1 failure
+        if (datum.isEmpty()) {
+          datum.setLastCheck();
+          datum.setDnsFailures(1);
+          output.collect(hostText, datum);
+          reporter.incrCounter("UpdateHostDb", "new_unknown_host", 1);
+          LOG.info(host + ": new_unknown_host " + datum);
+        } else {
+          datum.setLastCheck();
+          datum.incDnsFailures();
+
+          // Check if this host should be forgotten
+          if (purgeFailedHostsThreshold == -1 ||
+            purgeFailedHostsThreshold < datum.getDnsFailures()) {
+
+            output.collect(hostText, datum);
+            reporter.incrCounter("UpdateHostDb", "existing_unknown_host" ,1);
+            LOG.info(host + ": existing_unknown_host " + datum);
+          } else {
+            // datum is intentionally NOT collected, purging the host
+            reporter.incrCounter("UpdateHostDb", "purged_unknown_host" ,1);
+            LOG.info(host + ": purged_unknown_host " + datum);
+          }
+        }
+
+        reporter.incrCounter("UpdateHostDb",
+          Integer.toString(datum.numFailures()) + "_times_failed", 1);
+      } catch (Exception ioe) {
+        LOG.warn(StringUtils.stringifyException(ioe));
+      }
+    } catch (Exception e) {
+      LOG.warn(StringUtils.stringifyException(e));
+    }
+    
+    reporter.incrCounter("UpdateHostDb", "checked_hosts", 1);
+  }
+}
\ No newline at end of file


[33/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
new file mode 100644
index 0000000..a73187b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/MimeUtil.java
@@ -0,0 +1,279 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+// JDK imports
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Tika imports
+import org.apache.tika.Tika;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.mime.MimeTypesFactory;
+
+// Slf4j logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// imported for Javadoc
+import org.apache.nutch.protocol.ProtocolOutput;
+
+/**
+ * @author mattmann
+ * @since NUTCH-608
+ * 
+ *        <p>
+ *        This is a facade class to insulate Nutch from its underlying Mime Type
+ *        substrate library, <a href="http://incubator.apache.org/tika/">Apache
+ *        Tika</a>. Any mime handling code should be placed in this utility
+ *        class, and hidden from the Nutch classes that rely on it.
+ *        </p>
+ */
+public final class MimeUtil {
+
+  private static final String SEPARATOR = ";";
+
+  /* our Tika mime type registry */
+  private MimeTypes mimeTypes;
+
+  /* the tika detectors */
+  private Tika tika;
+
+  /* whether or not magic should be employed or not */
+  private boolean mimeMagic;
+
+  /* our log stream */
+  private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class
+      .getName());
+
+  public MimeUtil(Configuration conf) {
+    tika = new Tika();
+    ObjectCache objectCache = ObjectCache.get(conf);
+    MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
+        .getName());
+    if (mimeTypez == null) {
+      try {
+        String customMimeTypeFile = conf.get("mime.types.file");
+        if (customMimeTypeFile != null
+            && customMimeTypeFile.equals("") == false) {
+          try {
+            LOG.info("Using custom mime.types.file: {}", customMimeTypeFile);
+            mimeTypez = MimeTypesFactory.create(conf
+                .getConfResourceAsInputStream(customMimeTypeFile));
+          } catch (Exception e) {
+            LOG.error("Can't load mime.types.file : " + customMimeTypeFile
+                + " using Tika's default");
+          }
+        }
+        if (mimeTypez == null)
+          mimeTypez = MimeTypes.getDefaultMimeTypes();
+      } catch (Exception e) {
+        LOG.error("Exception in MimeUtil " + e.getMessage());
+        throw new RuntimeException(e);
+      }
+      objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
+    }
+
+    this.mimeTypes = mimeTypez;
+    this.mimeMagic = conf.getBoolean("mime.type.magic", true);
+  }
+
+  /**
+   * Cleans a {@link MimeType} name by removing out the actual {@link MimeType},
+   * from a string of the form:
+   * 
+   * <pre>
+   *      &lt;primary type&gt;/&lt;sub type&gt; ; &lt; optional params
+   * </pre>
+   * 
+   * @param origType
+   *          The original mime type string to be cleaned.
+   * @return The primary type, and subtype, concatenated, e.g., the actual mime
+   *         type.
+   */
+  public static String cleanMimeType(String origType) {
+    if (origType == null)
+      return null;
+
+    // take the origType and split it on ';'
+    String[] tokenizedMimeType = origType.split(SEPARATOR);
+    if (tokenizedMimeType.length > 1) {
+      // there was a ';' in there, take the first value
+      return tokenizedMimeType[0];
+    } else {
+      // there wasn't a ';', so just return the orig type
+      return origType;
+    }
+  }
+
+  /**
+   * A facade interface to trying all the possible mime type resolution
+   * strategies available within Tika. First, the mime type provided in
+   * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. Then
+   * the cleaned mime type is looked up in the underlying Tika {@link MimeTypes}
+   * registry, by its cleaned name. If the {@link MimeType} is found, then that
+   * mime type is used, otherwise URL resolution is used to try and determine
+   * the mime type. However, if <code>mime.type.magic</code> is enabled in
+   * {@link NutchConfiguration}, then mime type magic resolution is used to try
+   * and obtain a better-than-the-default approximation of the {@link MimeType}.
+   * 
+   * @param typeName
+   *          The original mime type, returned from a {@link ProtocolOutput}.
+   * @param url
+   *          The given @see url, that Nutch was trying to crawl.
+   * @param data
+   *          The byte data, returned from the crawl, if any.
+   * @return The correctly, automatically guessed {@link MimeType} name.
+   */
+  public String autoResolveContentType(String typeName, String url, byte[] data) {
+    String retType = null;
+    MimeType type = null;
+    String cleanedMimeType = null;
+
+    cleanedMimeType = MimeUtil.cleanMimeType(typeName);
+    // first try to get the type from the cleaned type name
+    if (cleanedMimeType != null) {
+      try {
+        type = mimeTypes.forName(cleanedMimeType);
+        cleanedMimeType = type.getName();
+      } catch (MimeTypeException mte) {
+        // Seems to be a malformed mime type name...
+        cleanedMimeType = null;
+      }
+    }
+
+    // if returned null, or if it's the default type then try url resolution
+    if (type == null
+        || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
+      // If no mime-type header, or cannot find a corresponding registered
+      // mime-type, then guess a mime-type from the url pattern
+      try {
+        retType = tika.detect(url) != null ? tika.detect(url) : null;
+      } catch (Exception e) {
+        String message = "Problem loading default Tika configuration";
+        LOG.error(message, e);
+        throw new RuntimeException(e);
+      }
+    } else {
+      retType = type.getName();
+    }
+
+    // if magic is enabled use mime magic to guess if the mime type returned
+    // from the magic guess is different than the one that's already set so far
+    // if it is, and it's not the default mime type, then go with the mime type
+    // returned by the magic
+    if (this.mimeMagic) {
+      String magicType = null;
+      // pass URL (file name) and (cleansed) content type from protocol to Tika
+      Metadata tikaMeta = new Metadata();
+      tikaMeta.add(Metadata.RESOURCE_NAME_KEY, url);
+      tikaMeta.add(Metadata.CONTENT_TYPE,
+          (cleanedMimeType != null ? cleanedMimeType : typeName));
+      try {
+        InputStream stream = TikaInputStream.get(data);
+        try {
+          magicType = mimeTypes.detect(stream, tikaMeta).toString();
+        } finally {
+          stream.close();
+        }
+      } catch (IOException ignore) {
+      }
+
+      if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
+          && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null
+          && !retType.equals(magicType)) {
+
+        // If magic enabled and the current mime type differs from that of the
+        // one returned from the magic, take the magic mimeType
+        retType = magicType;
+      }
+
+      // if type is STILL null after all the resolution strategies, go for the
+      // default type
+      if (retType == null) {
+        try {
+          retType = MimeTypes.OCTET_STREAM;
+        } catch (Exception ignore) {
+        }
+      }
+    }
+
+    return retType;
+  }
+
+  /**
+   * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
+   * method.
+   * 
+   * @param url
+   *          A string representation of the document {@link URL} to sense the
+   *          {@link MimeType} for.
+   * @return An appropriate {@link MimeType}, identified from the given Document
+   *         url in string form.
+   */
+  public String getMimeType(String url) {
+    return tika.detect(url);
+  }
+
+  /**
+   * A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
+   * method.
+   * 
+   * @param name
+   *          The name of a valid {@link MimeType} in the Tika mime registry.
+   * @return The object representation of the {@link MimeType}, if it exists, or
+   *         null otherwise.
+   */
+  public String forName(String name) {
+    try {
+      return this.mimeTypes.forName(name).toString();
+    } catch (MimeTypeException e) {
+      LOG.error("Exception getting mime type by name: [" + name
+          + "]: Message: " + e.getMessage());
+      return null;
+    }
+  }
+
+  /**
+   * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
+   * method.
+   * 
+   * @param f
+   *          The {@link File} to sense the {@link MimeType} for.
+   * @return The {@link MimeType} of the given {@link File}, or null if it
+   *         cannot be determined.
+   */
+  public String getMimeType(File f) {
+    try {
+      return tika.detect(f);
+    } catch (Exception e) {
+      LOG.error("Exception getting mime type for file: [" + f.getPath()
+          + "]: Message: " + e.getMessage());
+      return null;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java b/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
new file mode 100644
index 0000000..c99bae0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NodeWalker.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.Stack;
+
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
/**
 * <p>
 * A utility class that allows the walking of any DOM tree using a stack instead
 * of recursion. As the node tree is walked the next node is popped off of the
 * stack and all of its children are automatically added to the stack to be
 * called in tree order.
 * </p>
 * 
 * <p>
 * Currently this class is not thread safe. It is assumed that only one thread
 * will be accessing the <code>NodeWalker</code> at any given time.
 * </p>
 */
public class NodeWalker {

  // node most recently returned by nextNode()
  private Node currentNode;
  // child list of currentNode, remembered for skipChildren()
  private NodeList currentChildren;
  // pending nodes; the top of the stack is the next node to visit
  private Stack<Node> nodes;

  /**
   * Starts the <code>Node</code> tree from the root node.
   * 
   * @param rootNode
   *          the node the walk begins at
   */
  public NodeWalker(Node rootNode) {
    nodes = new Stack<Node>();
    nodes.push(rootNode);
  }

  /**
   * <p>
   * Returns the next <code>Node</code> on the stack and pushes all of its
   * children onto the stack, allowing us to walk the node tree without the use
   * of recursion. If there are no more nodes on the stack then null is
   * returned.
   * </p>
   * 
   * @return Node The next <code>Node</code> on the stack or null if there isn't
   *         a next node.
   */
  public Node nextNode() {
    // nothing left to visit
    if (!hasNext()) {
      return null;
    }

    // visit the top of the stack and queue up its children
    currentNode = nodes.pop();
    currentChildren = currentNode.getChildNodes();
    int childCount = (currentChildren == null) ? 0 : currentChildren.getLength();

    // push in reverse so the first child ends up on top of the stack
    for (int pos = childCount - 1; pos >= 0; pos--) {
      nodes.push(currentChildren.item(pos));
    }

    return currentNode;
  }

  /**
   * <p>
   * Skips over and removes from the node stack the children of the last node.
   * When getting a next node from the walker, that node's children are
   * automatically added to the stack. You can call this method to remove those
   * children from the stack.
   * </p>
   * 
   * <p>
   * This is useful when you don't want to process deeper into the current path
   * of the node tree but you want to continue processing sibling nodes.
   * </p>
   * 
   */
  public void skipChildren() {
    int childCount = (currentChildren == null) ? 0 : currentChildren.getLength();

    // pop each child in first-to-last order while it is still on top
    for (int pos = 0; pos < childCount; pos++) {
      Node top = nodes.peek();
      if (top.equals(currentChildren.item(pos))) {
        nodes.pop();
      }
    }
  }

  /**
   * Return the current node.
   * 
   * @return Node the node most recently returned by {@link #nextNode()}
   */
  public Node getCurrentNode() {
    return currentNode;
  }

  /**
   * @return returns true if there are more nodes on the current stack.
   * 
   */
  public boolean hasNext() {
    return !nodes.isEmpty();
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
new file mode 100644
index 0000000..ac71a93
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchConfiguration.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Map.Entry;
+import java.util.Properties;
+import java.util.UUID;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Utility to create Hadoop {@link Configuration}s that include Nutch-specific
+ * resources.
+ */
+public class NutchConfiguration {
+  public static final String UUID_KEY = "nutch.conf.uuid";
+
+  private NutchConfiguration() {
+  } // singleton
+
+  /*
+   * Configuration.hashCode() doesn't return values that correspond to a unique
+   * set of parameters. This is a workaround so that we can track instances of
+   * Configuration created by Nutch.
+   */
+  private static void setUUID(Configuration conf) {
+    UUID uuid = UUID.randomUUID();
+    conf.set(UUID_KEY, uuid.toString());
+  }
+
+  /**
+   * Retrieve a Nutch UUID of this configuration object, or null if the
+   * configuration was created elsewhere.
+   * 
+   * @param conf
+   *          configuration instance
+   * @return uuid or null
+   */
+  public static String getUUID(Configuration conf) {
+    return conf.get(UUID_KEY);
+  }
+
+  /**
+   * Create a {@link Configuration} for Nutch. This will load the standard Nutch
+   * resources, <code>nutch-default.xml</code> and <code>nutch-site.xml</code>
+   * overrides.
+   */
+  public static Configuration create() {
+    Configuration conf = new Configuration();
+    setUUID(conf);
+    addNutchResources(conf);
+    return conf;
+  }
+
+  /**
+   * Create a {@link Configuration} from supplied properties.
+   * 
+   * @param addNutchResources
+   *          if true, then first <code>nutch-default.xml</code>, and then
+   *          <code>nutch-site.xml</code> will be loaded prior to applying the
+   *          properties. Otherwise these resources won't be used.
+   * @param nutchProperties
+   *          a set of properties to define (or override)
+   */
+  public static Configuration create(boolean addNutchResources,
+      Properties nutchProperties) {
+    Configuration conf = new Configuration();
+    setUUID(conf);
+    if (addNutchResources) {
+      addNutchResources(conf);
+    }
+    for (Entry<Object, Object> e : nutchProperties.entrySet()) {
+      conf.set(e.getKey().toString(), e.getValue().toString());
+    }
+    return conf;
+  }
+
+  /**
+   * Add the standard Nutch resources to {@link Configuration}.
+   * 
+   * @param conf
+   *          Configuration object to which configuration is to be added.
+   */
+  private static Configuration addNutchResources(Configuration conf) {
+    conf.addResource("nutch-default.xml");
+    conf.addResource("nutch-site.xml");
+    return conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
new file mode 100644
index 0000000..8b4f8e0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchJob.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+
/** A {@link JobConf} for Nutch jobs. */
public class NutchJob extends JobConf {

  /**
   * Creates a Hadoop job configuration pre-loaded with the settings carried
   * by {@code conf}.
   *
   * @param conf
   *          the Nutch/Hadoop configuration to copy into this job
   */
  public NutchJob(Configuration conf) {
    // Passing NutchJob.class lets Hadoop locate the jar that contains it,
    // so that jar can be shipped to the cluster ("job jar" resolution).
    super(conf, NutchJob.class);
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java b/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
new file mode 100644
index 0000000..8e75177
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/NutchTool.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.nutch.metadata.Nutch;
+
+public abstract class NutchTool extends Configured {
+
+  protected HashMap<String, Object> results = new HashMap<String, Object>();
+  protected Map<String, Object> status = Collections
+      .synchronizedMap(new HashMap<String, Object>());
+  protected Job currentJob;
+  protected int numJobs;
+  protected int currentJobNum;
+
+  /**
+   * Runs the tool, using a map of arguments. May return results, or null.
+   */
+  public abstract Map<String, Object> run(Map<String, Object> args, String crawlId)
+      throws Exception;
+
+  public NutchTool(Configuration conf){
+    super(conf);
+  }
+
+  public NutchTool(){
+    super(null);
+  }
+
+  /** Returns relative progress of the tool, a float in range [0,1]. */
+  public float getProgress() {
+    float res = 0;
+    if (currentJob != null) {
+      try {
+        res = (currentJob.mapProgress() + currentJob.reduceProgress()) / 2.0f;
+      } catch (IOException e) {
+        e.printStackTrace();
+        res = 0;
+      } catch (IllegalStateException ile) {
+        ile.printStackTrace();
+        res = 0;
+      }
+    }
+    // take into account multiple jobs
+    if (numJobs > 1) {
+      res = (currentJobNum + res) / (float) numJobs;
+    }
+    status.put(Nutch.STAT_PROGRESS, res);
+    return res;
+  }
+
+  /** Returns current status of the running tool. */
+  public Map<String, Object> getStatus() {
+    return status;
+  }
+
+  /**
+   * Stop the job with the possibility to resume. Subclasses should override
+   * this, since by default it calls {@link #killJob()}.
+   * 
+   * @return true if succeeded, false otherwise
+   */
+  public boolean stopJob() throws Exception {
+    return killJob();
+  }
+
+  /**
+   * Kill the job immediately. Clients should assume that any results that the
+   * job produced so far are in inconsistent state or missing.
+   * 
+   * @return true if succeeded, false otherwise.
+   * @throws Exception
+   */
+  public boolean killJob() throws Exception {
+    if (currentJob != null && !currentJob.isComplete()) {
+      try {
+        currentJob.killJob();
+        return true;
+      } catch (Exception e) {
+        e.printStackTrace();
+        return false;
+      }
+    }
+    return false;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java b/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
new file mode 100644
index 0000000..0277ee6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/ObjectCache.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.HashMap;
+import java.util.WeakHashMap;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+
+public class ObjectCache {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ObjectCache.class);
+
+  private static final WeakHashMap<Configuration, ObjectCache> CACHE = new WeakHashMap<Configuration, ObjectCache>();
+
+  private final HashMap<String, Object> objectMap;
+
+  private ObjectCache() {
+    objectMap = new HashMap<String, Object>();
+  }
+
+  public synchronized static ObjectCache get(Configuration conf) {
+    ObjectCache objectCache = CACHE.get(conf);
+    if (objectCache == null) {
+      LOG.debug("No object cache found for conf=" + conf
+          + ", instantiating a new object cache");
+      objectCache = new ObjectCache();
+      CACHE.put(conf, objectCache);
+    }
+    return objectCache;
+  }
+
+  public synchronized Object getObject(String key) {
+    return objectMap.get(key);
+  }
+
+  public synchronized void setObject(String key, Object value) {
+    objectMap.put(key, value);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
new file mode 100644
index 0000000..e323b67
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/PrefixStringMatcher.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+/**
+ * A class for efficiently matching <code>String</code>s against a set of
+ * prefixes.
+ */
+public class PrefixStringMatcher extends TrieStringMatcher {
+
+  /**
+   * Creates a new <code>PrefixStringMatcher</code> which will match
+   * <code>String</code>s with any prefix in the supplied array. Zero-length
+   * <code>Strings</code> are ignored.
+   */
+  public PrefixStringMatcher(String[] prefixes) {
+    super();
+    for (int i = 0; i < prefixes.length; i++)
+      addPatternForward(prefixes[i]);
+  }
+
+  /**
+   * Creates a new <code>PrefixStringMatcher</code> which will match
+   * <code>String</code>s with any prefix in the supplied
+   * <code>Collection</code>.
+   * 
+   * @throws ClassCastException
+   *           if any <code>Object</code>s in the collection are not
+   *           <code>String</code>s
+   */
+  public PrefixStringMatcher(Collection<String> prefixes) {
+    super();
+    Iterator<String> iter = prefixes.iterator();
+    while (iter.hasNext())
+      addPatternForward(iter.next());
+  }
+
+  /**
+   * Returns true if the given <code>String</code> is matched by a prefix in the
+   * trie
+   */
+  public boolean matches(String input) {
+    TrieNode node = root;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        return false;
+      if (node.isTerminal())
+        return true;
+    }
+    return false;
+  }
+
+  /**
+   * Returns the shortest prefix of <code>input<code> that is matched,
+   * or <code>null<code> if no match exists.
+   */
+  public String shortestMatch(String input) {
+    TrieNode node = root;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        return null;
+      if (node.isTerminal())
+        return input.substring(0, i + 1);
+    }
+    return null;
+  }
+
+  /**
+   * Returns the longest prefix of <code>input<code> that is matched,
+   * or <code>null<code> if no match exists.
+   */
+  public String longestMatch(String input) {
+    TrieNode node = root;
+    String result = null;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        break;
+      if (node.isTerminal())
+        result = input.substring(0, i + 1);
+    }
+    return result;
+  }
+
+  public static final void main(String[] argv) {
+    PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] {
+        "abcd", "abc", "aac", "baz", "foo", "foobar" });
+
+    String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
+        "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
+
+    for (int i = 0; i < tests.length; i++) {
+      System.out.println("testing: " + tests[i]);
+      System.out.println("   matches: " + matcher.matches(tests[i]));
+      System.out.println("  shortest: " + matcher.shortestMatch(tests[i]));
+      System.out.println("   longest: " + matcher.longestMatch(tests[i]));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
new file mode 100644
index 0000000..d26cbfc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.metadata.Nutch;
+
+/**
+ * Extracts protocol status code information from the crawl database.
+ *
+ * ProtocolStatusStatistics will give you information on the count
+ * of all status codes encountered on your crawl. This can be useful
+ * for checking a number of things.
+ *
+ * An example output run showing the number of encountered status
+ * codes such as 200, 300, and a count of un-fetched record.
+ *
+ * 38	200
+ * 19	301
+ * 2	302
+ * 665	UNFETCHED
+ *
+ */
+public class ProtocolStatusStatistics extends Configured implements Tool {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ProtocolStatusStatistics.class);
+
+  private static final Text UNFETCHED_TEXT = new Text("UNFETCHED");
+
+  public static Configuration conf;
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println("Usage: ProtocolStatistics inputDirs outDir [numOfReducer]");
+
+      System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
+      System.err.println("\t\t\tE.g.: crawl/crawldb/");
+
+      System.err.println("\toutDir\t\tOutput directory where results should be dumped");
+
+      System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+      return 1;
+    }
+    String inputDir = args[0];
+    String outputDir = args[1];
+
+    int numOfReducers = 1;
+
+    if (args.length > 3) {
+      numOfReducers = Integer.parseInt(args[3]);
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("ProtocolStatistics: starting at " + sdf.format(start));
+
+    String jobName = "ProtocolStatistics";
+
+    conf = getConf();
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    Job job = Job.getInstance(conf, jobName);
+    job.setJarByClass(ProtocolStatusStatistics.class);
+
+    String[] inputDirsSpecs = inputDir.split(",");
+    for (int i = 0; i < inputDirsSpecs.length; i++) {
+      File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
+      FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+    }
+
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    FileOutputFormat.setOutputPath(job, new Path(outputDir));
+    job.setOutputFormatClass(TextOutputFormat.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(LongWritable.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(LongWritable.class);
+
+    job.setMapperClass(ProtocolStatusStatisticsMapper.class);
+    job.setReducerClass(ProtocolStatusStatisticsReducer.class);
+    job.setCombinerClass(ProtocolStatusStatisticsCombiner.class);
+    job.setNumReduceTasks(numOfReducers);
+
+    try {
+      job.waitForCompletion(true);
+    } catch (Exception e) {
+      throw e;
+    }
+
+    long end = System.currentTimeMillis();
+    LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+    return 0;
+  }
+
+  static class ProtocolStatusStatisticsMapper extends
+      Mapper<Text, CrawlDatum, Text, LongWritable> {
+
+    public void map(Text urlText, CrawlDatum datum, Context context)
+        throws IOException, InterruptedException {
+      if (datum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+        context.write((Text) datum.getMetaData().get(Nutch.PROTOCOL_STATUS_CODE_KEY), new LongWritable(1));
+      } else {
+        context.write(UNFETCHED_TEXT, new LongWritable(1));
+      }
+    }
+  }
+
+  static class ProtocolStatusStatisticsReducer extends
+      Reducer<Text, LongWritable, LongWritable, Text> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+
+      context.write(new LongWritable(total), key);
+    }
+  }
+
+  public static class ProtocolStatusStatisticsCombiner extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+      context.write(key, new LongWritable(total));
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(NutchConfiguration.create(), new ProtocolStatusStatistics(), args);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
new file mode 100644
index 0000000..149269f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/StringUtil.java
@@ -0,0 +1,155 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+/**
+ * A collection of String processing utility methods.
+ */
+public class StringUtil {
+
+  /**
+   * Returns a copy of <code>s</code> padded with trailing spaces so that it's
+   * length is <code>length</code>. Strings already <code>length</code>
+   * characters long or longer are not altered.
+   */
+  public static String rightPad(String s, int length) {
+    StringBuffer sb = new StringBuffer(s);
+    for (int i = length - s.length(); i > 0; i--)
+      sb.append(" ");
+    return sb.toString();
+  }
+
+  /**
+   * Returns a copy of <code>s</code> padded with leading spaces so that it's
+   * length is <code>length</code>. Strings already <code>length</code>
+   * characters long or longer are not altered.
+   */
+  public static String leftPad(String s, int length) {
+    StringBuffer sb = new StringBuffer();
+    for (int i = length - s.length(); i > 0; i--)
+      sb.append(" ");
+    sb.append(s);
+    return sb.toString();
+  }
+
+  private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6',
+      '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
+
+  /**
+   * Convenience call for {@link #toHexString(byte[], String, int)}, where
+   * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+   * 
+   * @param buf
+   */
+  public static String toHexString(byte[] buf) {
+    return toHexString(buf, null, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Get a text representation of a byte[] as hexadecimal String, where each
+   * pair of hexadecimal digits corresponds to consecutive bytes in the array.
+   * 
+   * @param buf
+   *          input data
+   * @param sep
+   *          separate every pair of hexadecimal digits with this separator, or
+   *          null if no separation is needed.
+   * @param lineLen
+   *          break the output String into lines containing output for lineLen
+   *          bytes.
+   */
+  public static String toHexString(byte[] buf, String sep, int lineLen) {
+    if (buf == null)
+      return null;
+    if (lineLen <= 0)
+      lineLen = Integer.MAX_VALUE;
+    StringBuffer res = new StringBuffer(buf.length * 2);
+    for (int i = 0; i < buf.length; i++) {
+      int b = buf[i];
+      res.append(HEX_DIGITS[(b >> 4) & 0xf]);
+      res.append(HEX_DIGITS[b & 0xf]);
+      if (i > 0 && (i % lineLen) == 0)
+        res.append('\n');
+      else if (sep != null && i < lineLen - 1)
+        res.append(sep);
+    }
+    return res.toString();
+  }
+
+  /**
+   * Convert a String containing consecutive (no inside whitespace) hexadecimal
+   * digits into a corresponding byte array. If the number of digits is not
+   * even, a '0' will be appended in the front of the String prior to
+   * conversion. Leading and trailing whitespace is ignored.
+   * 
+   * @param text
+   *          input text
+   * @return converted byte array, or null if unable to convert
+   */
+  public static byte[] fromHexString(String text) {
+    text = text.trim();
+    if (text.length() % 2 != 0)
+      text = "0" + text;
+    int resLen = text.length() / 2;
+    int loNibble, hiNibble;
+    byte[] res = new byte[resLen];
+    for (int i = 0; i < resLen; i++) {
+      int j = i << 1;
+      hiNibble = charToNibble(text.charAt(j));
+      loNibble = charToNibble(text.charAt(j + 1));
+      if (loNibble == -1 || hiNibble == -1)
+        return null;
+      res[i] = (byte) (hiNibble << 4 | loNibble);
+    }
+    return res;
+  }
+
+  private static final int charToNibble(char c) {
+    if (c >= '0' && c <= '9') {
+      return c - '0';
+    } else if (c >= 'a' && c <= 'f') {
+      return 0xa + (c - 'a');
+    } else if (c >= 'A' && c <= 'F') {
+      return 0xA + (c - 'A');
+    } else {
+      return -1;
+    }
+  }
+
+  /**
+   * Checks if a string is empty (ie is null or empty).
+   */
+  public static boolean isEmpty(String str) {
+    return (str == null) || (str.equals(""));
+  }
+
+  /**
+   * Simple character substitution which cleans all \ufffd chars from a given String.
+   */
+  public static String cleanField(String value) {
+    return value.replaceAll("\ufffd", "");
+  }
+
+  public static void main(String[] args) {
+    if (args.length != 1)
+      System.out.println("Usage: StringUtil <encoding name>");
+    else
+      System.out.println(args[0] + " is resolved to "
+          + EncodingDetector.resolveEncodingAlias(args[0]));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
new file mode 100644
index 0000000..a967c01
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/SuffixStringMatcher.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+/**
+ * A class for efficiently matching <code>String</code>s against a set of
+ * suffixes. Zero-length <code>Strings</code> are ignored.
+ */
+public class SuffixStringMatcher extends TrieStringMatcher {
+
+  /**
+   * Creates a new <code>PrefixStringMatcher</code> which will match
+   * <code>String</code>s with any suffix in the supplied array.
+   */
+  public SuffixStringMatcher(String[] suffixes) {
+    super();
+    for (int i = 0; i < suffixes.length; i++)
+      addPatternBackward(suffixes[i]);
+  }
+
+  /**
+   * Creates a new <code>PrefixStringMatcher</code> which will match
+   * <code>String</code>s with any suffix in the supplied
+   * <code>Collection</code>
+   */
+  public SuffixStringMatcher(Collection<String> suffixes) {
+    super();
+    Iterator<String> iter = suffixes.iterator();
+    while (iter.hasNext())
+      addPatternBackward(iter.next());
+  }
+
+  /**
+   * Returns true if the given <code>String</code> is matched by a suffix in the
+   * trie
+   */
+  public boolean matches(String input) {
+    TrieNode node = root;
+    for (int i = input.length() - 1; i >= 0; i--) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        return false;
+      if (node.isTerminal())
+        return true;
+    }
+    return false;
+  }
+
+  /**
+   * Returns the shortest suffix of <code>input<code> that is matched,
+   * or <code>null<code> if no match exists.
+   */
+  public String shortestMatch(String input) {
+    TrieNode node = root;
+    for (int i = input.length() - 1; i >= 0; i--) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        return null;
+      if (node.isTerminal())
+        return input.substring(i);
+    }
+    return null;
+  }
+
+  /**
+   * Returns the longest suffix of <code>input<code> that is matched,
+   * or <code>null<code> if no match exists.
+   */
+  public String longestMatch(String input) {
+    TrieNode node = root;
+    String result = null;
+    for (int i = input.length() - 1; i >= 0; i--) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
+        break;
+      if (node.isTerminal())
+        result = input.substring(i);
+    }
+    return result;
+  }
+
+  public static final void main(String[] argv) {
+    SuffixStringMatcher matcher = new SuffixStringMatcher(new String[] { "a",
+        "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar" });
+
+    String[] tests = { "a", "ac", "abcd", "abcdefg", "apple", "aa", "aac",
+        "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
+
+    for (int i = 0; i < tests.length; i++) {
+      System.out.println("testing: " + tests[i]);
+      System.out.println("   matches: " + matcher.matches(tests[i]));
+      System.out.println("  shortest: " + matcher.shortestMatch(tests[i]));
+      System.out.println("   longest: " + matcher.longestMatch(tests[i]));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
new file mode 100644
index 0000000..68ded69
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TableUtil.java
@@ -0,0 +1,161 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.util;
+
+import org.apache.commons.lang.StringUtils;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+
+public class TableUtil {
+
+  public static final ByteBuffer YES_VAL = ByteBuffer.wrap(new byte[] { 'y' });
+
+  /**
+   * Reverses a url's domain. This form is better for storing in hbase. Because
+   * scans within the same domain are faster.
+   * <p>
+   * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
+   * "com.foo.bar:8983:http/to/index.html?a=b".
+   * 
+   * @param url
+   *          url to be reversed
+   * @return Reversed url
+   * @throws MalformedURLException
+   */
+  public static String reverseUrl(String urlString)
+      throws MalformedURLException {
+    return reverseUrl(new URL(urlString));
+  }
+
+  /**
+   * Reverses a url's domain. This form is better for storing in hbase. Because
+   * scans within the same domain are faster.
+   * <p>
+   * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes
+   * "com.foo.bar:http:8983/to/index.html?a=b".
+   * 
+   * @param url
+   *          url to be reversed
+   * @return Reversed url
+   */
+  public static String reverseUrl(URL url) {
+    String host = url.getHost();
+    String file = url.getFile();
+    String protocol = url.getProtocol();
+    int port = url.getPort();
+
+    StringBuilder buf = new StringBuilder();
+
+    /* reverse host */
+    reverseAppendSplits(host, buf);
+
+    /* add protocol */
+    buf.append(':');
+    buf.append(protocol);
+
+    /* add port if necessary */
+    if (port != -1) {
+      buf.append(':');
+      buf.append(port);
+    }
+
+    /* add path */
+    if (file.length() > 0 && '/' != file.charAt(0)) {
+      buf.append('/');
+    }
+    buf.append(file);
+
+    return buf.toString();
+  }
+
+  public static String unreverseUrl(String reversedUrl) {
+    StringBuilder buf = new StringBuilder(reversedUrl.length() + 2);
+
+    int pathBegin = reversedUrl.indexOf('/');
+    if (pathBegin == -1)
+      pathBegin = reversedUrl.length();
+    String sub = reversedUrl.substring(0, pathBegin);
+
+    String[] splits = StringUtils.splitPreserveAllTokens(sub, ':'); // {<reversed
+                                                                    // host>,
+                                                                    // <port>,
+                                                                    // <protocol>}
+
+    buf.append(splits[1]); // add protocol
+    buf.append("://");
+    reverseAppendSplits(splits[0], buf); // splits[0] is reversed
+    // host
+    if (splits.length == 3) { // has a port
+      buf.append(':');
+      buf.append(splits[2]);
+    }
+    buf.append(reversedUrl.substring(pathBegin));
+    return buf.toString();
+  }
+
+  /**
+   * Given a reversed url, returns the reversed host E.g
+   * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar"
+   * 
+   * @param reversedUrl
+   *          Reversed url
+   * @return Reversed host
+   */
+  public static String getReversedHost(String reversedUrl) {
+    return reversedUrl.substring(0, reversedUrl.indexOf(':'));
+  }
+
+  private static void reverseAppendSplits(String string, StringBuilder buf) {
+    String[] splits = StringUtils.split(string, '.');
+    if (splits.length > 0) {
+      for (int i = splits.length - 1; i > 0; i--) {
+        buf.append(splits[i]);
+        buf.append('.');
+      }
+      buf.append(splits[0]);
+    } else {
+      buf.append(string);
+    }
+  }
+
+  public static String reverseHost(String hostName) {
+    StringBuilder buf = new StringBuilder();
+    reverseAppendSplits(hostName, buf);
+    return buf.toString();
+
+  }
+
+  public static String unreverseHost(String reversedHostName) {
+    return reverseHost(reversedHostName); // Reversible
+  }
+
+  /**
+   * Convert given Utf8 instance to String and and cleans out any offending "\ufffd"
+   * from the String.
+   * 
+   * 
+   * @param utf8
+   *          Utf8 object
+   * @return string-ifed Utf8 object or null if Utf8 instance is null
+   */
+  public static String toString(CharSequence utf8) {
+    return (utf8 == null ? null : StringUtil.cleanField(utf8.toString()));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
new file mode 100644
index 0000000..c4af356
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TimingUtil.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.concurrent.TimeUnit;
+
public class TimingUtil {

  /**
   * Calculate the elapsed time between two times specified in milliseconds.
   * 
   * @param start
   *          The start of the time period
   * @param end
   *          The end of the time period
   * @return a human readable string of the form "hh:mm:ss" (see
   *         {@link #secondsToHMS(long)}), or null if start &gt; end
   */
  public static String elapsedTime(long start, long end) {
    if (start > end) {
      return null;
    }
    return secondsToHMS((end - start) / 1000);
  }

  /**
   * Show time in seconds as hours, minutes and seconds (hh:mm:ss)
   * 
   * @param seconds
   *          (elapsed) time in seconds
   * @return human readable time string "hh:mm:ss"
   */
  public static String secondsToHMS(long seconds) {
    long hours = TimeUnit.SECONDS.toHours(seconds);
    long minutes = TimeUnit.SECONDS.toMinutes(seconds)
        % TimeUnit.HOURS.toMinutes(1);
    // the remaining seconds within the current minute (the previous
    // identity conversion TimeUnit.SECONDS.toSeconds(...) was a no-op)
    long secs = seconds % TimeUnit.MINUTES.toSeconds(1);
    return String.format("%02d:%02d:%02d", hours, minutes, secs);
  }

  /**
   * Show time in seconds as days, hours, minutes and seconds (d days, hh:mm:ss)
   * 
   * @param seconds
   *          (elapsed) time in seconds
   * @return human readable time string "d days, hh:mm:ss"
   */
  public static String secondsToDaysHMS(long seconds) {
    long days = TimeUnit.SECONDS.toDays(seconds);
    if (days == 0)
      return secondsToHMS(seconds);
    String hhmmss = secondsToHMS(seconds % TimeUnit.DAYS.toSeconds(1));
    return String.format("%d days, %s", days, hhmmss);
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java b/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
new file mode 100644
index 0000000..95f06ad
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/TrieStringMatcher.java
@@ -0,0 +1,202 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.ListIterator;
+
/**
 * TrieStringMatcher is a base class for simple tree-based string matching.
 * 
 */
public abstract class TrieStringMatcher {
  protected TrieNode root;

  protected TrieStringMatcher() {
    this.root = new TrieNode('\000', false);
  }

  /**
   * Node class for the character tree.
   */
  protected class TrieNode implements Comparable<TrieNode> {
    // children are kept either as a sorted array (read phase, binary
    // search) or as a sorted linked list (build phase, cheap insertion);
    // exactly one of the two is non-null at any time
    protected TrieNode[] children;
    protected LinkedList<TrieNode> childrenList;
    protected char nodeChar;
    protected boolean terminal;

    /**
     * Creates a new TrieNode, which contains the given <code>nodeChar</code>.
     * If <code>isTerminal</code> is <code>true</code>, the new node is a
     * <em>terminal</em> node in the trie.
     */
    TrieNode(char nodeChar, boolean isTerminal) {
      this.nodeChar = nodeChar;
      this.terminal = isTerminal;
      this.childrenList = new LinkedList<TrieNode>();
    }

    /**
     * Returns <code>true</code> if this node is a <em>terminal</em> node in the
     * trie.
     */
    boolean isTerminal() {
      return terminal;
    }

    /**
     * Returns the child node of this node whose node-character is
     * <code>nextChar</code>. If no such node exists, one will be added. If
     * <em>isTerminal</em> is <code>true</code>, the node will be a terminal
     * node in the trie.
     */
    TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) {
      // switch back to list representation if a previous lookup converted
      // the children to an array
      if (childrenList == null) {
        childrenList = new LinkedList<TrieNode>();
        childrenList.addAll(Arrays.asList(children));
        children = null;
      }

      if (childrenList.size() == 0) {
        TrieNode newNode = new TrieNode(nextChar, isTerminal);
        childrenList.add(newNode);
        return newNode;
      }

      // advance to the insertion point, keeping the list sorted by nodeChar
      ListIterator<TrieNode> iter = childrenList.listIterator();
      TrieNode node = iter.next();
      while ((node.nodeChar < nextChar) && iter.hasNext())
        node = iter.next();

      if (node.nodeChar == nextChar) {
        node.terminal = node.terminal | isTerminal;
        return node;
      }

      if (node.nodeChar > nextChar)
        iter.previous();

      TrieNode newNode = new TrieNode(nextChar, isTerminal);
      iter.add(newNode);
      return newNode;
    }

    /**
     * Returns the child node of this node whose node-character is
     * <code>nextChar</code>. If no such node exists, <code>null</code> is
     * returned.
     */
    TrieNode getChild(char nextChar) {
      // lazily convert the list to a sorted array for binary search
      if (children == null) {
        children = childrenList.toArray(new TrieNode[childrenList.size()]);
        childrenList = null;
        Arrays.sort(children);
      }

      int min = 0;
      int max = children.length - 1;
      int mid = 0;
      while (min < max) {
        mid = (min + max) / 2;
        if (children[mid].nodeChar == nextChar)
          return children[mid];
        if (children[mid].nodeChar < nextChar)
          min = mid + 1;
        else
          // if (children[mid].nodeChar > nextChar)
          max = mid - 1;
      }

      if (min == max)
        if (children[min].nodeChar == nextChar)
          return children[min];

      return null;
    }

    public int compareTo(TrieNode other) {
      if (this.nodeChar < other.nodeChar)
        return -1;
      if (this.nodeChar == other.nodeChar)
        return 0;
      // if (this.nodeChar > other.nodeChar)
      return 1;
    }
  }

  /**
   * Returns the next {@link TrieNode} visited, given that you are at
   * <code>node</code>, and the next character in the input is the
   * <code>idx</code>'th character of <code>s</code>.
   */
  protected final TrieNode matchChar(TrieNode node, String s, int idx) {
    return node.getChild(s.charAt(idx));
  }

  /**
   * Adds any necessary nodes to the trie so that the given <code>String</code>
   * can be decoded and the last character is represented by a terminal node.
   * Zero-length <code>Strings</code> are ignored.
   */
  protected final void addPatternForward(String s) {
    TrieNode node = root;
    int stop = s.length() - 1;
    int i;
    if (s.length() > 0) {
      for (i = 0; i < stop; i++)
        node = node.getChildAddIfNotPresent(s.charAt(i), false);
      node = node.getChildAddIfNotPresent(s.charAt(i), true);
    }
  }

  /**
   * Adds any necessary nodes to the trie so that the given <code>String</code>
   * can be decoded <em>in reverse</em> and the first character is represented
   * by a terminal node. Zero-length <code>Strings</code> are ignored.
   */
  protected final void addPatternBackward(String s) {
    TrieNode node = root;
    if (s.length() > 0) {
      for (int i = s.length() - 1; i > 0; i--)
        node = node.getChildAddIfNotPresent(s.charAt(i), false);
      node = node.getChildAddIfNotPresent(s.charAt(0), true);
    }
  }

  /**
   * Returns true if the given <code>String</code> is matched by a pattern in
   * the trie
   */
  public abstract boolean matches(String input);

  /**
   * Returns the shortest substring of <code>input</code> that is
   * matched by a pattern in the trie, or <code>null</code> if no match
   * exists.
   */
  public abstract String shortestMatch(String input);

  /**
   * Returns the longest substring of <code>input</code> that is
   * matched by a pattern in the trie, or <code>null</code> if no match
   * exists.
   */
  public abstract String longestMatch(String input);

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
new file mode 100644
index 0000000..3e696cb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/URLUtil.java
@@ -0,0 +1,533 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.MalformedURLException;
+import java.net.*;
+import java.util.regex.Pattern;
+
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+/** Utility class for URL analysis */
+public class URLUtil {
+
+  /**
+   * Resolve relative URL-s and fix a java.net.URL error in handling of URLs
+   * with pure query targets.
+   * 
+   * @param base
+   *          base url
+   * @param target
+   *          target url (may be relative)
+   * @return resolved absolute url.
+   * @throws MalformedURLException
+   */
+  public static URL resolveURL(URL base, String target)
+      throws MalformedURLException {
+    target = target.trim();
+
+    // handle the case that there is a target that is a pure query,
+    // for example
+    // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
+    // It has urls in the page of the form href="?co=0&sk=0&pg=1", and by
+    // default
+    // URL constructs the base+target combo as
+    // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly
+    // dropping the Search.aspx target
+    //
+    // Browsers handle these just fine, they must have an exception similar to
+    // this
+    if (target.startsWith("?")) {
+      return fixPureQueryTargets(base, target);
+    }
+
+    return new URL(base, target);
+  }
+
+  /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
+  static URL fixPureQueryTargets(URL base, String target)
+      throws MalformedURLException {
+    if (!target.startsWith("?"))
+      return new URL(base, target);
+
+    String basePath = base.getPath();
+    String baseRightMost = "";
+    int baseRightMostIdx = basePath.lastIndexOf("/");
+    if (baseRightMostIdx != -1) {
+      baseRightMost = basePath.substring(baseRightMostIdx + 1);
+    }
+
+    if (target.startsWith("?"))
+      target = baseRightMost + target;
+
+    return new URL(base, target);
+  }
+
+  private static Pattern IP_PATTERN = Pattern
+      .compile("(\\d{1,3}\\.){3}(\\d{1,3})");
+
+  /**
+   * Returns the domain name of the url. The domain name of a url is the
+   * substring of the url's hostname, w/o subdomain names. As an example <br>
+   * <code>
+   *  getDomainName(conf, new URL(http://lucene.apache.org/))
+   *  </code><br>
+   * will return <br>
+   * <code> apache.org</code>
+   * */
+  public static String getDomainName(URL url) {
+    DomainSuffixes tlds = DomainSuffixes.getInstance();
+    String host = url.getHost();
+    // it seems that java returns hostnames ending with .
+    if (host.endsWith("."))
+      host = host.substring(0, host.length() - 1);
+    if (IP_PATTERN.matcher(host).matches())
+      return host;
+
+    int index = 0;
+    String candidate = host;
+    for (; index >= 0;) {
+      index = candidate.indexOf('.');
+      String subCandidate = candidate.substring(index + 1);
+      if (tlds.isDomainSuffix(subCandidate)) {
+        return candidate;
+      }
+      candidate = subCandidate;
+    }
+    return candidate;
+  }
+
+  /**
+   * Returns the domain name of the url. The domain name of a url is the
+   * substring of the url's hostname, w/o subdomain names. As an example <br>
+   * <code>
+   *  getDomainName(conf, new http://lucene.apache.org/)
+   *  </code><br>
+   * will return <br>
+   * <code> apache.org</code>
+   * 
+   * @throws MalformedURLException
+   */
+  public static String getDomainName(String url) throws MalformedURLException {
+    return getDomainName(new URL(url));
+  }
+
+  /**
+   * Returns the top level domain name of the url. The top level domain name of
+   * a url is the substring of the url's hostname, w/o subdomain names. As an
+   * example <br>
+   * <code>
+   *  getTopLevelDomainName(conf, new http://lucene.apache.org/)
+   *  </code><br>
+   * will return <br>
+   * <code> org</code>
+   * 
+   * @throws MalformedURLException
+   */
+  public static String getTopLevelDomainName(URL url)
+      throws MalformedURLException {
+    String suffix = getDomainSuffix(url).toString();
+    int idx = suffix.lastIndexOf(".");
+    if (idx != -1) {
+      return suffix.substring(idx + 1);
+    } else {
+      return suffix;
+    }
+  }
+
+  /**
+   * Returns the top level domain name of the url. The top level domain name of
+   * a url is the substring of the url's hostname, w/o subdomain names. As an
+   * example <br>
+   * <code>
+   *  getTopLevelDomainName(conf, new http://lucene.apache.org/)
+   *  </code><br>
+   * will return <br>
+   * <code> org</code>
+   * 
+   * @throws MalformedURLException
+   */
+  public static String getTopLevelDomainName(String url)
+      throws MalformedURLException {
+    return getTopLevelDomainName(new URL(url));
+  }
+
+  /**
+   * Returns whether the given urls have the same domain name. As an example, <br>
+   * <code> isSameDomain(new URL("http://lucene.apache.org")
+   * , new URL("http://people.apache.org/"))
+   * <br> will return true. </code>
+   * 
+   * @return true if the domain names are equal
+   */
+  public static boolean isSameDomainName(URL url1, URL url2) {
+    return getDomainName(url1).equalsIgnoreCase(getDomainName(url2));
+  }
+
+  /**
+   * Returns whether the given urls have the same domain name. As an example, <br>
+   * <code> isSameDomain("http://lucene.apache.org"
+   * ,"http://people.apache.org/")
+   * <br> will return true. </code>
+   * 
+   * @return true if the domain names are equal
+   * @throws MalformedURLException
+   */
+  public static boolean isSameDomainName(String url1, String url2)
+      throws MalformedURLException {
+    return isSameDomainName(new URL(url1), new URL(url2));
+  }
+
+  /**
+   * Returns the {@link DomainSuffix} corresponding to the last public part of
+   * the hostname
+   */
+  public static DomainSuffix getDomainSuffix(URL url) {
+    DomainSuffixes tlds = DomainSuffixes.getInstance();
+    String host = url.getHost();
+    if (IP_PATTERN.matcher(host).matches())
+      return null;
+
+    int index = 0;
+    String candidate = host;
+    for (; index >= 0;) {
+      index = candidate.indexOf('.');
+      String subCandidate = candidate.substring(index + 1);
+      DomainSuffix d = tlds.get(subCandidate);
+      if (d != null) {
+        return d;
+      }
+      candidate = subCandidate;
+    }
+    return null;
+  }
+
+  /**
+   * Returns the {@link DomainSuffix} corresponding to the last public part of
+   * the hostname
+   */
+  public static DomainSuffix getDomainSuffix(String url)
+      throws MalformedURLException {
+    return getDomainSuffix(new URL(url));
+  }
+
+  /** Partitions of the hostname of the url by "." */
+  public static String[] getHostSegments(URL url) {
+    String host = url.getHost();
+    // return whole hostname, if it is an ipv4
+    // TODO : handle ipv6
+    if (IP_PATTERN.matcher(host).matches())
+      return new String[] { host };
+    return host.split("\\.");
+  }
+
+  /**
+   * Partitions of the hostname of the url by "."
+   * 
+   * @throws MalformedURLException
+   */
+  public static String[] getHostSegments(String url)
+      throws MalformedURLException {
+    return getHostSegments(new URL(url));
+  }
+
+  /**
+   * <p>
+   * Given two urls, a src and a destination of a redirect, it returns the
+   * representative url.
+   * <p>
+   * 
+   * <p>
+   * This method implements an extended version of the algorithm used by the
+   * Yahoo! Slurp crawler described here:<br>
+   * <a href=
+   * "http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> How
+   * does the Yahoo! webcrawler handle redirects?</a> <br>
+   * <br>
+   * <ol>
+   * <li>Choose target url if either url is malformed.</li>
+   * <li>If different domains the keep the destination whether or not the
+   * redirect is temp or perm</li>
+   * <ul>
+   * <li>a.com -> b.com*</li>
+   * </ul>
+   * <li>If the redirect is permanent and the source is root, keep the source.</li>
+   * <ul>
+   * <li>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</li>
+   * </ul>
+   * <li>If the redirect is permanent and the source is not root and the
+   * destination is root, keep the destination</li>
+   * <ul>
+   * <li>a.com/xyz/index.html -> a.com*</li>
+   * </ul>
+   * <li>If the redirect is permanent and neither the source nor the destination
+   * is root, then keep the destination</li>
+   * <ul>
+   * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li>
+   * </ul>
+   * <li>If the redirect is temporary and source is root and destination is not
+   * root, then keep the source</li>
+   * <ul>
+   * <li>*a.com -> a.com/xyz/index.html</li>
+   * </ul>
+   * <li>If the redirect is temporary and source is not root and destination is
+   * root, then keep the destination</li>
+   * <ul>
+   * <li>a.com/xyz/index.html -> a.com*</li>
+   * </ul>
+   * <li>If the redirect is temporary and neither the source or the destination
+   * is root, then keep the shortest url. First check for the shortest host, and
+   * if both are equal then check by path. Path is first by length then by the
+   * number of / path separators.</li>
+   * <ul>
+   * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li>
+   * <li>*www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html</li>
+   * </ul>
+   * <li>If the redirect is temporary and both the source and the destination
+   * are root, then keep the shortest sub-domain</li>
+   * <ul>
+   * <li>*www.a.com -> www.news.a.com</li>
+   * </ul>
+   * <br>
+   * While not in this logic there is a further piece of representative url
+   * logic that occurs during indexing and after scoring. During creation of the
+   * basic fields before indexing, if a url has a representative url stored we
+   * check both the url and its representative url (which should never be the
+   * same) against their linkrank scores and the highest scoring one is kept as
+   * the url and the lower scoring one is held as the orig url inside of the
+   * index.
+   * 
+   * @param src
+   *          The source url.
+   * @param dst
+   *          The destination url.
+   * @param temp
+   *          Is the redirect a temporary redirect.
+   * 
+   * @return String The representative url.
+   */
+  public static String chooseRepr(String src, String dst, boolean temp) {
+
+    // validate both are well formed urls
+    URL srcUrl;
+    URL dstUrl;
+    try {
+      srcUrl = new URL(src);
+      dstUrl = new URL(dst);
+    } catch (MalformedURLException e) {
+      return dst;
+    }
+
+    // get the source and destination domain, host, and page
+    String srcDomain = URLUtil.getDomainName(srcUrl);
+    String dstDomain = URLUtil.getDomainName(dstUrl);
+    String srcHost = srcUrl.getHost();
+    String dstHost = dstUrl.getHost();
+    String srcFile = srcUrl.getFile();
+    String dstFile = dstUrl.getFile();
+
+    // are the source and destination the root path url.com/ or url.com
+    boolean srcRoot = (srcFile.equals("/") || srcFile.length() == 0);
+    boolean destRoot = (dstFile.equals("/") || dstFile.length() == 0);
+
+    // 1) different domain them keep dest, temp or perm
+    // a.com -> b.com*
+    //
+    // 2) permanent and root, keep src
+    // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
+    //
+    // 3) permanent and not root and dest root, keep dest
+    // a.com/xyz/index.html -> a.com*
+    //
+    // 4) permanent and neither root keep dest
+    // a.com/xyz/index.html -> a.com/abc/page.html*
+    //
+    // 5) temp and root and dest not root keep src
+    // *a.com -> a.com/xyz/index.html
+    //
+    // 7) temp and not root and dest root keep dest
+    // a.com/xyz/index.html -> a.com*
+    //
+    // 8) temp and neither root, keep shortest, if hosts equal by path else by
+    // hosts. paths are first by length then by number of / separators
+    // a.com/xyz/index.html -> a.com/abc/page.html*
+    // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
+    //
+    // 9) temp and both root keep shortest sub domain
+    // *www.a.com -> www.news.a.com
+
+    // if we are dealing with a redirect from one domain to another keep the
+    // destination
+    if (!srcDomain.equals(dstDomain)) {
+      return dst;
+    }
+
+    // if it is a permanent redirect
+    if (!temp) {
+
+      // if source is root return source, otherwise destination
+      if (srcRoot) {
+        return src;
+      } else {
+        return dst;
+      }
+    } else { // temporary redirect
+
+      // source root and destination not root
+      if (srcRoot && !destRoot) {
+        return src;
+      } else if (!srcRoot && destRoot) { // destination root and source not
+        return dst;
+      } else if (!srcRoot && !destRoot && (srcHost.equals(dstHost))) {
+
+        // source and destination hosts are the same, check paths, host length
+        int numSrcPaths = srcFile.split("/").length;
+        int numDstPaths = dstFile.split("/").length;
+        if (numSrcPaths != numDstPaths) {
+          return (numDstPaths < numSrcPaths ? dst : src);
+        } else {
+          int srcPathLength = srcFile.length();
+          int dstPathLength = dstFile.length();
+          return (dstPathLength < srcPathLength ? dst : src);
+        }
+      } else {
+
+        // different host names and both root take the shortest
+        int numSrcSubs = srcHost.split("\\.").length;
+        int numDstSubs = dstHost.split("\\.").length;
+        return (numDstSubs < numSrcSubs ? dst : src);
+      }
+    }
+  }
+
+  /**
+   * Returns the lowercased hostname for the url or null if the url is not well
+   * formed.
+   * 
+   * @param url
+   *          The url to check.
+   * @return String The hostname for the url.
+   */
+  public static String getHost(String url) {
+    try {
+      return new URL(url).getHost().toLowerCase();
+    } catch (MalformedURLException e) {
+      return null;
+    }
+  }
+
+  /**
+   * Returns the page for the url. The page consists of the protocol, host, and
+   * path, but does not include the query string. The host is lowercased but the
+   * path is not.
+   * 
+   * @param url
+   *          The url to check.
+   * @return String The page for the url.
+   */
+  public static String getPage(String url) {
+    try {
+      // get the full url, and replace the query string with and empty string
+      url = url.toLowerCase();
+      String queryStr = new URL(url).getQuery();
+      return (queryStr != null) ? url.replace("?" + queryStr, "") : url;
+    } catch (MalformedURLException e) {
+      return null;
+    }
+  }
+
+  public static String getProtocol(String url) {
+    try {
+      return getProtocol(new URL(url));
+    } catch (Exception e) {
+      return null;
+    }
+  }
+
+  public static String getProtocol(URL url) {
+    return url.getProtocol();
+  }
+
+  public static String toASCII(String url) {
+    try {
+      URL u = new URL(url);
+      String host = u.getHost();
+      if (host == null || host.isEmpty()) {
+        // no host name => no punycoded domain name
+        // also do not add additional slashes for file: URLs (NUTCH-1880)
+        return url;
+      }
+      URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host),
+          u.getPort(), u.getPath(), u.getQuery(), u.getRef());
+
+      return p.toString();
+    } catch (Exception e) {
+      return null;
+    }
+  }
+
+  public static String toUNICODE(String url) {
+    try {
+      URL u = new URL(url);
+      String host = u.getHost();
+      if (host == null || host.isEmpty()) {
+        // no host name => no punycoded domain name
+        // also do not add additional slashes for file: URLs (NUTCH-1880)
+        return url;
+      }
+      StringBuilder sb = new StringBuilder();
+      sb.append(u.getProtocol());
+      sb.append("://");
+      if (u.getUserInfo() != null) {
+        sb.append(u.getUserInfo());
+        sb.append('@');
+      }
+      sb.append(IDN.toUnicode(host));
+      if (u.getPort() != -1) {
+        sb.append(':');
+        sb.append(u.getPort());
+      }
+      sb.append(u.getFile()); // includes query
+      if (u.getRef() != null) {
+        sb.append('#');
+        sb.append(u.getRef());
+      }
+
+      return sb.toString();
+    } catch (Exception e) {
+      return null;
+    }
+  }
+
+  /** For testing */
+  public static void main(String[] args) {
+
+    if (args.length != 1) {
+      System.err.println("Usage : URLUtil <url>");
+      return;
+    }
+
+    String url = args[0];
+    try {
+      System.out.println(URLUtil.getDomainName(new URL(url)));
+    } catch (MalformedURLException ex) {
+      ex.printStackTrace();
+    }
+  }
+}


[20/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/de.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/de.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/de.test
new file mode 100644
index 0000000..9d6e5c9
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/de.test
@@ -0,0 +1,104 @@
+Wiederaufnahme der Sitzungsperiode
+Ich erkl�re die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europ�ischen Parlaments f�r wiederaufgenommen, w�nsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, da� Sie sch�ne Ferien hatten.
+Wie Sie feststellen konnten, ist der gef�rchtete "Millenium-Bug " nicht eingetreten. Doch sind B�rger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden. Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den n�chsten Tagen. Heute m�chte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen -, allen Opfern der St�rme, insbesondere in den verschiedenen L�ndern der Europ�ischen Union, in einer Schweigeminute zu gedenken. Ich bitte Sie, sich zu einer Schweigeminute zu erheben.
+(Das Parlament erhebt sich zu einer Schweigeminute.)
+
+Frau Pr�sidentin, zur Gesch�ftsordnung. Wie Sie sicher aus der Presse und dem Fernsehen wissen, gab es in Sri Lanka mehrere Bombenexplosionen mit zahlreichen Toten. Zu den Attentatsopfern, die es in j�ngster Zeit in Sri Lanka zu beklagen gab, z�hlt auch Herr Kumar Ponnambalam, der dem Europ�ischen Parlament erst vor wenigen Monaten einen Besuch abgestattet hatte. W�re es angemessen, wenn Sie, Frau Pr�sidentin, der Pr�sidentin von Sri Lanka in einem Schreiben das Bedauern des Parlaments zum gewaltsamen Tod von Herrn Ponnambalam und anderen B�rgern von Sri Lanka �bermitteln und sie auffordern w�rden, alles in ihrem Kr�ften stehende zu tun, um nach einer friedlichen L�sung dieser sehr schwierigen Situation zu suchen?
+
+Ja, Herr Evans, ich denke, da� eine derartige Initiative durchaus angebracht ist. Wenn das Haus damit einverstanden ist, werde ich dem Vorschlag von Herrn Evans folgen.
+
+Frau Pr�sidentin, zur Gesch�ftsordnung. K�nnten Sie mir eine Auskunft zu Artikel 143 im Zusammenhang mit der Unzul�ssigkeit geben? Meine Frage betrifft eine Angelegenheit, die am Donnerstag zur Sprache kommen wird und auf die ich dann erneut verweisen werde.
+Das Parlament wird sich am Donnerstag mit dem Cunha-Bericht �ber mehrj�hrige Ausrichtungsprogramme befassen, der in Absatz 6 vorschl�gt, da� L�nder, die ihr Soll zur Flottenverkleinerung nicht erf�llen, j�hrlich mit einer Art Quotenstrafe belegt werden sollen. Und zwar sollen derartige Strafen trotz des Grundsatzes der relativen Stabilit�t verh�ngt werden. Ich meine, da� der Grundsatz der relativen Stabilit�t einen elementaren Rechtsgrundsatz der gemeinsamen Fischereipolitik darstellt und ein Vorschlag, diesen zu unterlaufen, rechtlich unzul�ssig w�re. Ich m�chte wissen, ob es m�glich ist, einen Einwand gegen ein Dokument zu erheben, bei dem es sich lediglich um einen Bericht und keinen Legislativvorschlag handelt, und ob ich befugt bin, dies am Donnerstag zu tun.
+
+Genau dann k�nnen Sie, wenn Sie wollen, diese Frage ansprechen, d. h. am Donnerstag zu Beginn der Aussprache �ber den Bericht.
+
+Frau Pr�sidentin! Die erste diesj�hrige Tagung des Europ�ischen Parlaments f�llt leider damit zusammen, da� in den Vereinigten Staaten, in Texas, f�r Donnerstag dieser Woche die Hinrichtung eines zum Tode verurteilten 34j�hrigen jungen Mannes namens Hicks festgelegt worden ist.
+Auf Wunsch eines franz�sischen Mitglieds, Herrn Zimeray, wurde bereits eine Petition eingereicht, die von vielen, auch von mir selbst, unterzeichnet worden ist. Gem�� der vom Europ�ischen Parlament und von der gesamten Europ�ischen Union nunmehr st�ndig vertretenen Linie m�chte ich Sie jedoch bitten, den ganzen Einflu� Ihres Amtes und der Institution, die Sie vertreten, bei dem Pr�sidentschaftskandidaten und Gouverneur von Texas, George W. Bush, der zur Aussetzung der Vollstreckung des Todesurteils und zur Begnadigung des Verurteilten befugt ist, geltend zu machen.
+All dies entspricht den Grunds�tzen, die wir stets verteidigt haben.
+
+Vielen Dank, Herr Segni, das will ich gerne tun. Das ist ganz im Sinne der Position, die wir als Parlament immer vertreten haben.
+
+Frau Pr�sidentin! Ich m�chte Sie auf einen Fall aufmerksam machen, mit dem sich dieses Parlament immer wieder befa�t hat. Das ist der Fall von Alexander Nikitin. Wir freuen uns hier alle, da� das Gericht ihn freigesprochen und deutlich gemacht hat, da� auch in Ru�land der Zugang zu Umweltinformationen konstitutionelles Recht ist. Nun ist es aber so, da� er wieder angeklagt werden soll, weil der Staatsanwalt in Berufung geht. Wir wissen und wir haben es in wirklich sehr vielen Entschlie�ungen festgestellt - gerade w�hrend der letzten Plenartagung des vergangenen Jahres-, da� dies nicht nur ein juristischer Fall ist und da� es falsch ist, Alexander Nikitin Kriminalit�t und Verrat vorzuwerfen, weil wir als Betroffene von seinen Ergebnissen einen Nutzen haben. Diese Ergebnisse sind die Grundlage f�r die europ�ischen Programme zum Schutz der Barentsee, und deswegen bitte ich Sie, einen Briefentwurf, der Ihnen die wichtigsten Fakten schildert, zu pr�fen und im Sinne der Bes
 chl�sse des Parlaments in Ru�land diese Position deutlich zu machen.
+
+Frau Schroedter, ich bin gerne bereit, die damit zusammenh�ngenden Fakten zu pr�fen, wenn mir Ihr Brief vorliegt.
+
+Frau Pr�sidentin, zun�chst besten Dank daf�r, da� Sie Wort gehalten haben und nun in der ersten Sitzungsperiode des neuen Jahres das Angebot an Fernsehprogrammen in unseren B�ros tats�chlich enorm erweitert ist. Dennoch, Frau Pr�sidentin, wurde meinem Wunsch nicht entsprochen. Zwar k�nnen wir jetzt zwei finnische und einen portugiesischen, nach wie vor aber keinen niederl�ndischen Sender empfangen. Ich hatte Sie aber um ein niederl�ndisches Programm gebeten, denn auch wir Niederl�nder m�chten die Nachrichten verfolgen, wenn wir jeden Monat hierher in die Verbannung geschickt werden. Deshalb m�chte ich Sie nochmals ersuchen, daf�r Sorge zu tragen, da� auch ein niederl�ndischer Sender eingespeist wird.
+
+Frau Plooij-van Gorsel, ich kann Ihnen mitteilen, da� dieser Punkt am Mittwoch auf der Tagesordnung der Qu�storen steht. Ich hoffe, da� dort in Ihrem Sinne entschieden wird.
+
+Frau Pr�sidentin, k�nnen Sie mir sagen, warum sich dieses Parlament nicht an die Arbeitsschutzregelungen h�lt, die es selbst verabschiedet hat? Weshalb wurde die Luftqualit�t in diesem Geb�ude seit unserer Wahl nicht ein einziges Mal �berpr�ft? Weshalb ist der Arbeitsschutzausschu� seit 1998 nicht ein einziges Mal zusammengetreten? Warum hat weder im Br�sseler noch im Stra�burger Parlamentsgeb�ude eine Brandschutz�bung stattgefunden? Warum finden keine Brandschutzbelehrungen statt? Warum wurde nach meinem Unfall nichts unternommen, um die Treppen sicherer zu machen? Warum wird in den Nichtraucherzonen das Rauchverbot nicht durchgesetzt? Es ist eine Schande, da� wir Regeln verabschieden, an die wir uns dann selbst nicht halten.
+
+Frau Lynne, Sie haben v�llig recht, und ich werde pr�fen, ob all dies wirklich so ist. Ich werde die Frage auch den Qu�storen unterbreiten. Ich bin mir sicher, da� diese gro�en Wert darauf legen, da� wir die Rechtsvorschriften, die wir verabschieden, auch selbst einhalten.
+
+Frau Pr�sidentin! Frau D�ez Gonz�lez und ich hatten einige Anfragen zu bestimmten, in einer spanischen Zeitung wiedergegebenen Stellungnahmen der Vizepr�sidentin, Frau de Palacio, gestellt. Die zust�ndigen Dienste haben sie nicht in die Tagesordnung aufgenommen, da sie der Meinung waren, sie seien schon in einer vorangegangenen Sitzung beantwortet worden.
+Ich bitte, diese Entscheidung zu �berdenken, weil das nicht der Fall ist. Die fr�her beantworteten Anfragen bezogen sich auf das Auftreten von Frau de Palacio in einer bestimmten Angelegenheit, nicht auf die am 18. November des vergangenen Jahres in der Tageszeitung ABC erschienenen Erkl�rungen.
+
+Lieber Kollege, wir werden das pr�fen. Ich mu� Ihnen aber sagen, da� ich die Lage im Moment f�r etwas verworren halte. Wir werden das aber sehr genau pr�fen, damit alles seine Richtigkeit hat.
+
+Frau Pr�sidentin, ich w��te gern, ob das Parlament in dieser Woche ein deutliches Signal unserer Unzufriedenheit bez�glich der heutigen Entscheidung, mit der eine Verl�ngerung des Waffenembargos gegen Indonesien abgelehnt wird, aussenden wird, zumal sich die gro�e Mehrheit in diesem Parlament in der Vergangenheit f�r das Waffenembargo gegen Indonesien ausgesprochen hat. Die heutige Entscheidung gegen eine Verl�ngerung des Embargos birgt angesichts der dortigen Lage eine sehr gro�e Gefahr. Das Parlament sollte, da dies dem Wunsch der gro�en Mehrheit entspricht, eine entsprechende Botschaft senden. Die Ablehnung einer Verl�ngerung des Embargos seitens der EU-Mitgliedstaaten ist unverantwortlich. Wie bereits festgestellt wurde, ist die Lage in Indonesien �u�erst instabil. Es besteht sogar die Gefahr eines Milit�rputsches. Wir wissen nicht, was passiert. Weshalb also sollten Waffenhersteller in der EU auf Kosten unschuldiger Menschen Profite einstreichen?
+
+Dieser Punkt ist bisher nicht f�r die Dringlichkeitsdebatte am Donnerstag vorgesehen.
+
+Arbeitsplan
+Nach der Tagesordnung folgt die Pr�fung des endg�ltigen Entwurfs der Tagesordnung, wie er nach Artikel 110 der Gesch�ftsordnung am Donnerstag, dem 13. Januar von der Konferenz der Pr�sidenten festgelegt wurde. Zu Montag und Dienstag liegen keine �nderungen vor.
+Zum Mittwoch:
+Die Sozialdemokratische Fraktion beantragt, eine Erkl�rung der Kommission �ber ihre strategischen Ziele f�r die n�chsten f�nf Jahre sowie �ber die Verwaltungsreform der Kommission in die Tagesordnung aufzunehmen.
+Ich bitte den Antragsteller, Herrn Bar�n Crespo, seinen Antrag zu begr�nden, falls er dies w�nscht. Danach verfahren wir wie �blich: ein Redner daf�r, einer dagegen.
+
+Frau Pr�sidentin! Die Vorstellung des politischen Programms der Kommission Prodi f�r die gesamte Wahlperiode ging auf einen Vorschlag der Fraktion der Sozialdemokratischen Partei Europas zur�ck, der die einhellige Billigung der Konferenz der Pr�sidenten im September und auch die ausdr�ckliche Zustimmung von Pr�sident Prodi fand, der seine Zusage in seiner Antrittsrede bekr�ftigte.
+Diese Zusage ist insofern von Bedeutung, als die Kommission ein Organ ist, das nach den Vertr�gen das Initiativmonopol besitzt und somit grundlegend die politische und legislative T�tigkeit dieses Parlaments in den n�chsten f�nf Jahren gestaltet. Ich m�chte auch daran erinnern, da� dieses Parlament in der vorangegangenen Wahlperiode Pr�sident Prodi zweimal sein Vertrauen ausgesprochen hat; in dieser Wahlperiode sprach es ihm im Juli erneut sein Vertrauen aus, und dann, als die neue Kommission im Amt war, gab es im September erneut ein Vertrauensvotum f�r die Kommission insgesamt. Somit hatte die Kommission bereits gen�gend Zeit, ihr Programm zu erarbeiten, und wir, um es kennenlernen und den B�rgern erkl�ren zu k�nnen. In diesem Sinne erinnere ich an die Entschlie�ung vom 15. September, in der empfohlen wurde, den Vorschlag in der k�rzestm�glichen Frist vorzulegen.
+Die Ereignisse der vergangenen Woche - die am Rande der Konferenz der Pr�sidenten ihren Anfang nahmen und wobei diese Konferenz nur zur Best�tigung und Ratifizierung von au�erhalb gefa�ten Beschl�ssen genutzt wurde - verdeutlichen ein Dilemma: Entweder ist die Kommission nicht in der Lage, dieses Programm vorzulegen (In diesem Fall sollte sie eine Kl�rung herbeif�hren. Nach den Worten ihres Pr�sidenten ist sie dazu in der Lage. Da die Kommission durch die Vizepr�sidentin, Frau de Palacio, vertreten ist, halte ich es f�r zweckm��ig, vor der Abstimmung die Position der Kommission hinsichtlich ihrer Bereitschaft zur Vorstellung des Programms, so wie es vereinbart war, zu erfahren.), oder das Parlament ist zur Pr�fung dieses Programms nicht in der Lage, wie einige offenbar vorgeben. Nach meiner Ansicht w�rde diese zweite Hypothese einem Verzicht auf unsere Verantwortung als Parlament und dar�ber hinaus dem Aufwerfen einer originellen These, einer unbekannten Methode gleic
 hkommen, die darin best�nde, den Fraktionen die programmatische Rede der Kommission in schriftlicher Form eine Woche vorher - und nicht, wie vereinbart, am Tag zuvor - zur Kenntnis zu geben, wobei zu ber�cksichtigen ist, da� das Legislativprogramm im Februar diskutiert werden wird, so da� wir auf die Aussprache verzichten k�nnten, da die Presse und das Internet am Tag darauf alle B�rger dar�ber informiert haben w�rden und das Parlament keinen Grund mehr h�tte, sich mit der Angelegenheit zu befassen.
+Da meine Fraktion der Meinung ist, da� ein Parlament dazu da ist, zuzuh�ren, zu diskutieren und nachzudenken, gibt es unserer Ansicht nach keinen Grund zur Rechtfertigung dieser Verz�gerung, und wir glauben, wenn die Kommission dazu in der Lage ist, liegen wir genau in der Zeit, um die urspr�ngliche Vereinbarung zwischen dem Parlament und der Kommission wieder in Kraft zu setzen und verantwortungsbewu�t vor unsere Mitb�rgerinnen und Mitb�rgern treten zu k�nnen. Deshalb besteht der Vorschlag der Fraktion der Sozialdemokratischen Partei Europas, den Sie erw�hnt haben, darin, den Mittwoch als Termin der Vorstellung des Programms der Kommission Prodi f�r die Wahlperiode beizubehalten, und in dieses Programm auch das Verwaltungsreformprojekt einzubeziehen, da wir andernfalls in eine paradoxe Situation geraten k�nnten: Mit der Ausrede, der Wortlaut liege nicht vor, wird einerseits dem Pr�sidenten der Kommission das Recht abgesprochen, in diesem Parlament zu sprechen, und ander
 erseits w�rde eine Aussprache �ber die Reform stattfinden, ohne da� dieses Parlament zuvor die Texte lesen konnte, die der Aussprache zugrunde liegen. Daher bitte ich Sie, Frau Pr�sidentin, die Kommission zu ersuchen, sich jetzt zu �u�ern, und danach zur Abstimmung zu schreiten.
+(Beifall der PSE-Fraktion)
+
+Frau Pr�sidentin, liebe Kolleginnen und Kollegen! Ich bin doch etwas erstaunt �ber das Verhalten des Kollegen Bar�n Crespo, der jetzt verlangt, da� dieser Tagesordnungspunkt auf die Tagesordnung f�r Mittwoch gesetzt wird.
+Herr Kollege Bar�n Crespo, Sie konnten am letzten Donnerstag in der Konferenz der Pr�sidenten nicht anwesend sein. Das kritisiere ich nicht; es kommt immer mal vor, da� man sich vertreten l��t. Der Kollege H�nsch hat Sie dort vertreten. Wir haben in der Konferenz der Pr�sidenten eine ausf�hrliche Debatte gef�hrt. Nur Ihre Fraktion hat das vertreten, was Sie jetzt sagen. Wir haben dann abgestimmt. Jeder Vorsitzende bzw. jede Vorsitzende hat ja so viele Stimmen, wie die Fraktion Mitglieder hat. Es gab eine Abstimmung zu diesem Punkt. Diese Abstimmung ist meiner Erinnerung nach so ausgegangen: 422 gegen 180 Stimmen bei einigen wenigen Enthaltungen. Das hei�t, alle Fraktionen, mit Ausnahme der Fraktionslosen - aber die sind ja keine Fraktion - waren sich einig, nur Ihre Fraktion war der Meinung, so zu verfahren, wie Sie es hier vorgeschlagen haben. Alle anderen waren anderer Meinung. Das war der Beschlu�.
+Jetzt m�chte ich zur Sache selbst etwas sagen. Wir haben Vertrauen zur Kommission, zu Romano Prodi, und die ganz gro�e Mehrheit unserer Fraktion hat Romano Prodi und der Kommission nach einem schwierigen Proze�, wie jeder wei�, das Vertrauen ausgesprochen. Aber wir sind auch der Meinung, da� wir eine Debatte �ber diese Strategie der Kommission in einem geordneten Verfahren f�hren m�ssen, nicht nur aufgrund einer m�ndlichen Erkl�rung hier im Europ�ischen Parlament, sondern auch aufgrund eines Dokumentes, das in der Kommission beschlossen ist und dieses Programm f�r f�nf Jahre beschreibt. Ein solches Dokument gibt es nicht!
+
+Die Kommission wird das Programm f�r das Jahr 2000 im Februar vorlegen. Wir haben gesagt, o. k, wenn die Kommission das Programm 2000 noch nicht im Januar machen will, dann machen wir das im Februar. Wir haben dem zugestimmt. Wir wollen ja an sich keinen Streit mit der Kommission, sondern wir sind der Meinung, wenn es eben geht, m�ssen Kommission und Parlament einen gemeinsamen Weg gehen. Aber wir als Parlament sind auch der Kontrolleur der Kommission. Und nicht alles, was von der Kommission kommt, mu� unsere Meinung sein.
+Ich m�chte, da� wir uns auf eine Debatte �ber das F�nfjahresprogramm in den Fraktionen vern�nftig vorbereiten k�nnen. Man kann sich nicht vorbereiten, wenn man hier eine Erkl�rung h�rt und gar nicht wei�, was Inhalt einer solchen Erkl�rung ist. Deswegen ist es unsere Empfehlung - und mein Eindruck ist, da� die Kommission auch aufgeschlossen ist f�r diesen Gedanken -, da� wir im Februar die Debatte �ber das langfristige Programm der Kommission bis zum Jahre 2005 f�hren - ich hoffe, die Kommission wird sich bis dahin auch auf ein Programm verst�ndigen, das sie uns vorschlagen wird -, und da� wir gleichzeitig im Februar auch die Debatte �ber das Legislativprogramm der Kommission f�r das Jahr 2000 f�hren. Es ist also auch ein vern�nftiger sachlicher Zusammenhang, der uns r�t, die Debatte �ber beide Programme gemeinsam zu f�hren. Deswegen lehnt meine Fraktion den Vorschlag der Sozialistischen Fraktion entschieden ab!
+(Beifall von der PPE-DE-Fraktion)
+
+Frau Pr�sidentin! Ich m�chte ganz deutlich sagen, da� die Kommission vor allem h�chsten Respekt gegen�ber den Beschl�ssen dieses Parlaments hat, und dazu geh�rt die Aufstellung seiner Tagesordnung. Deshalb respektieren wir die Entscheidungen des Parlaments in diesem Sinne. Aber ich m�chte auch ganz deutlich zum Ausdruck bringen, da� sich Pr�sident Prodi gegen�ber dem Parlament, wie Herr Bar�n in Erinnerung brachte, zu einer neuen Aussprache verpflichtet hat, die zus�tzlich zu der Jahresaussprache �ber das Legislativprogramm der Kommission, �ber die gro�en Aktionslinien f�r den n�chsten F�nfjahreszeitraum, das hei�t, f�r diese Wahlperiode, stattfinden soll.
+Ich m�chte sagen, da� diese Aussprache in der im September getroffenen Vereinbarung von der Jahresvorlage des Legislativprogramms der Kommission abgetrennt wurde. Und ich m�chte auch sagen, da� wir seitens der Kommission auf diese Aussprache vorbereitet und bereit sind, sie zum geeigneten Zeitpunkt zu f�hren, da� wir f�r diese Woche darauf vorbereitet waren, wie dies grunds�tzlich vereinbart war, wobei man am Vorabend vor den Fraktionen des Parlaments mit einer Rede auftreten wollte.
+Deshalb, Frau Pr�sidentin, m�chte ich bekr�ftigen, da� wir unsererseits das Aktionsprogramm f�r die n�chsten f�nf Jahre diskutiert haben und darauf vorbereitet sind, sofern es vom Parlament so beschlossen wird - noch in dieser Woche, wenn der Beschlu� so lautet -, das F�nfjahresprogramm und im n�chsten Monat das Programm f�r das Jahr 2000, genau so wie vereinbart, vorzustellen.
+
+Ich schlage vor, da� wir �ber den Antrag der Sozialdemokratischen Fraktion, die Erkl�rung der Kommission �ber ihre strategischen Ziele wieder auf die Tagesordnung zu setzen, abstimmen.
+(Das Parlament lehnt den Antrag ab.) Die Pr�sidentin. Zum Mittwoch liegt mir noch ein weiterer Antrag betreffend die m�ndliche Anfrage �ber die Kapitalsteuer vor. Die PPE/DE-Fraktion beantragt, diesen Punkt von der Tagesordnung abzusetzen.
+M�chte jemand den Antrag im Namen der Fraktion begr�nden?
+
+Frau Pr�sidentin, da ich bei den Sozialisten ein bi�chen Gel�chter h�re - mir wurde gesagt, da� auch weite Kreise der Sozialistischen Fraktion diesen Tagesordnungspunkt gern abgesetzt haben wollen, weil bei der Abstimmung in der Konferenz der Pr�sidenten das Votum der Arbeitsgruppe der zust�ndigen Kolleginnen und Kollegen der Sozialistischen Fraktion nicht vorlag. Ich wei� nicht, ob diese Information richtig ist, aber wir als EVP-ED-Fraktion w�ren jedenfalls dankbar, wenn dieser Punkt abgesetzt w�rde, weil sich das Parlament n�mlich schon mehrfach mit dieser Frage befa�t hat. Es gibt auch Beschl�sse gegen eine solche Steuer. Deswegen beantragt meine Fraktion, diesen Punkt von der Tagesordnung abzusetzen.
+
+Vielen Dank, Herr Poettering.
+Wir kommen nun zu Herrn Wurtz, der gegen den Antrag spricht.
+
+Frau Pr�sidentin, ich m�chte zun�chst darauf hinweisen, da� das, was Herr Poettering da sagt, nicht ganz logisch ist. Zum einen belehrt er die Sozialdemokratische Fraktion, weil diese eine ganz klare und eindeutige Entscheidung der Konferenz der Pr�sidenten in Frage stellt. Und nun tut er genau dasselbe. Wir haben diskutiert, wir waren uns einig, nur die EVP-Fraktion und die Liberalen nicht. Und ich hatte noch darauf hingewiesen, die anderen Pr�sidentenkollegen werden sich noch daran erinnern, da� es nicht darum geht, ob man f�r oder gegen die Tobin-Steuer ist, sondern darum, ob wir bereit sind, uns anzuh�ren, was die Kommission und der Rat davon halten. Das ist nicht zuviel verlangt. Ich wiederhole also den Vorschlag, diese m�ndliche Anfrage an die Kommission und den Rat aufrechtzuerhalten, um ein f�r alle Mal die Meinung dieser beiden Institutionen zu diesem relativ bescheidenen Vorschlag zu erfahren, der f�r die �ffentlichkeit ein wichtiges Signal w�re, insbesondere
  nach der Aufregung im Gefolge des Scheiterns der Konferenz von Seattle.
+
+Wir stimmen jetzt �ber den Antrag der PPE/DE-Fraktion ab, die m�ndliche Anfrage �ber die Kapitalsteuer von der Tagesordnung abzusetzen.
+(Das Parlament lehnt den Antrag mit 164 Ja-Stimmen, 166 Nein-Stimmen und 7 Enthaltungen ab.)
+
+Frau Pr�sidentin, ich m�chte Herrn Poettering f�r das R�hren der Werbetrommel zugunsten dieser Aussprache danken. Vielen Dank.
+
+Frau Pr�sidentin! Ist meine Stimme mitgez�hlt worden? Ich konnte sie n�mlich nicht elektronisch abgeben, weil ich die Karte nicht habe. Ich habe "daf�r " gestimmt.
+
+In der Tat, wenn man die beiden Mitglieder, die sich gemeldet haben hinzuz�hlt, dann ergibt sich als Ergebnis ...
+
+Frau Pr�sidentin! Die Pr�sidentschaft hat das Ergebnis der Abstimmung verk�ndet. �nderungen sind nicht m�glich.
+
+Liebe Kolleginnen und Kollegen, ich mu� Sie nochmals daran erinnern, montags Ihre Stimmkarte mitzubringen. Wir haben da offensichtlich ein Problem, und ich mu� jetzt eine Entscheidung treffen.
+Auch ich habe meine Stimmkarte vergessen, und ich h�tte dagegen gestimmt. Ich gehe somit davon aus, da� die m�ndliche Anfrage auf der Tagesordnung bleibt.
+Das war das letzte Mal, da� wir vergessene Karten ber�cksichtigen. Damit das ein f�r alle Mal klar ist!
+(Beifall)
+Richtig, damit bleibt die m�ndliche Anfrage auf der Tagesordnung, und richtig, die Pr�sidentin hat das Recht abzustimmen, wie sie auch das Recht hat, ihre Stimmkarte zu vergessen.
+Wir kommen nun zu den weiteren �nderungen der Tagesordnung.
+
+Frau Pr�sidentin, bei der fr�heren Abstimmung zur Frage des Strategieplans der Kommission - keine Angst, ich werde mich an Ihre Entscheidung in dieser Sache halten - hatte ich darum gebeten, vor der Abstimmung im Namen meiner Fraktion sprechen zu d�rfen. Dazu kam es nicht. Ich w�rde es begr��en, wenn ich zum Abschlu� dieses Gesch�ftspunktes die M�glichkeit h�tte, im Namen meiner Fraktion eine Erkl�rung zur Abstimmung abzugeben. Das ist eine wichtige Angelegenheit. Vielleicht w�re es sinnvoll festzuhalten, wie das, was wir eben getan haben, von den einzelnen vor dem Hintergrund ihrer eigenen politischen Analyse aufgenommen wird.
+
+Frau Pr�sidentin! Ich will die Debatte nicht wieder aufnehmen, aber ich hatte mich auch gemeldet, um zu dem Antrag von Herrn Bar�n Crespo Stellung zu nehmen. Sie haben mich auch nicht aufgerufen. Ich bedauere das, aber die Abstimmung ist durchgef�hrt worden, die Entscheidung ist gefallen, also lassen wir die Dinge.
+
+Das tut mir leid, Herr H�nsch und Herr Cox. Ich hatte nicht gesehen, da� Sie ums Wort gebeten hatten. Doch die Positionen sind deutlich geworden und werden ins Protokoll aufgenommen. Wenn wir morgen das Protokoll der Sitzung von heute verabschieden, k�nnen die Kolleginnen und Kollegen, die der Auffassung sind, da� die Positionen nicht ausreichend klar geworden sind, �nderungen beantragen. Ich halte dieses Vorgehen f�r angemessen. Nat�rlich werden im Protokoll der Sitzung von morgen s�mtliche zus�tzlichen Erl�uterungen ber�cksichtigt. Ich halte das f�r besser als jetzt lange Stimmerkl�rungen abzugeben. Herr Cox, Herr H�nsch, sind Sie damit einverstanden?
+
+Frau Pr�sidentin, wenn aus der Abstimmung einwandfrei hervorgeht, wie meine Fraktion abgestimmt hat, dann werde und kann ich nichts dagegen sagen. Wenn Sie festlegen, da� ich keine Erkl�rung zur Abstimmung abgeben kann, akzeptiere ich das, wenngleich unter Vorbehalt.
+
+Beim Abfassen des Protokolls werden wir mit gro�er Sorgfalt vorgehen. Das tun wir im �brigen immer. Wenn Positionen nicht richtig wiedergegeben werden, k�nnen wir das Protokoll gegebenenfalls �ndern.
+(Das Parlament genehmigt den ge�nderten Arbeitsplan.)

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/el.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/el.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/el.test
new file mode 100644
index 0000000..5fa13e7
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/el.test
@@ -0,0 +1,109 @@
+\u0395\u03c0\u03b1v\u03ac\u03bb\u03b7\u03c8\u03b7 \u03c4\u03b7\u03c2 \u03c3\u03c5v\u03c3\u03b4o\u03c5
+\u039a\u03b7\u03c1\u03cd\u03c3\u03c3\u03c9 \u03c4\u03b7\u03bd \u03b5\u03c0\u03b1\u03bd\u03ac\u03bb\u03b7\u03c8\u03b7 \u03c4\u03b7\u03c2 \u03c3\u03c5\u03bd\u03cc\u03b4\u03bf\u03c5 \u03c4\u03bf\u03c5 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03bf\u03cd \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03b7 \u03bf\u03c0\u03bf\u03af\u03b1 \u03b5\u03af\u03c7\u03b5 \u03b4\u03b9\u03b1\u03ba\u03bf\u03c0\u03b5\u03af \u03c4\u03b7\u03bd \u03a0\u03b1\u03c1\u03b1\u03c3\u03ba\u03b5\u03c5\u03ae 17 \u0394\u03b5\u03ba\u03b5\u03bc\u03b2\u03c1\u03af\u03bf\u03c5 \u03ba\u03b1\u03b9 \u03c3\u03b1\u03c2 \u03b1\u03c0\u03b5\u03c5\u03b8\u03cd\u03bd\u03c9 \u03be\u03b1\u03bd\u03ac \u03c4\u03b9\u03c2 \u03b8\u03b5\u03c1\u03bc\u03ad\u03c2 \u03b5\u03c5\u03c7\u03ad\u03c2 \u03bc\u03bf\u03c5, \u03b5\u03bb\u03c0\u03af\u03b6\u03bf\u03bd\u03c4\u03b1\u03c2 \u03bd\u03b1 \u03c0\u03b5\u03c1\u03ac\u03c3\u03b1\u03c4\u03b5 \u03ba\u03b1\u03bb\u03ac \u03c3\u03c4\u03b9\u03c2 \u03b4\u03b9\u03b1\u03ba\u03bf\u03c0\u03ad\u03c2.
+\u038c\u03c0\u03c9\u03c2 \u03bc\u03c0\u03bf\u03c1\u03ad\u03c3\u03b1\u03c4\u03b5 \u03bd\u03b1 \u03b4\u03b9\u03b1\u03c0\u03b9\u03c3\u03c4\u03ce\u03c3\u03b5\u03c4\u03b5, \u03bf \u03c0\u03b5\u03c1\u03af\u03c6\u03b7\u03bc\u03bf\u03c2 "\u03b9\u03cc\u03c2 \u03c4\u03bf\u03c5 \u03ad\u03c4\u03bf\u03c5\u03c2 2000" \u03b4\u03b5\u03bd \u03b5\u03bc\u03c6\u03b1\u03bd\u03af\u03c3\u03b8\u03b7\u03ba\u03b5. \u0391\u03bd\u03c4\u03b9\u03b8\u03ad\u03c4\u03c9\u03c2, \u03bf\u03b9 \u03c0\u03bf\u03bb\u03af\u03c4\u03b5\u03c2 \u03bf\u03c1\u03b9\u03c3\u03bc\u03ad\u03bd\u03c9\u03bd \u03c7\u03c9\u03c1\u03ce\u03bd \u03bc\u03b1\u03c2 \u03c5\u03c0\u03ae\u03c1\u03be\u03b1\u03bd \u03b8\u03cd\u03bc\u03b1\u03c4\u03b1 \u03c6\u03c5\u03c3\u03b9\u03ba\u03ce\u03bd \u03ba\u03b1\u03c4\u03b1\u03c3\u03c4\u03c1\u03bf\u03c6\u03ce\u03bd, \u03bf\u03b9 \u03bf\u03c0\u03bf\u03af\u03b5\u03c2 \u03ae\u03c4\u03b1\u03bd \u03cc\u03bd\u03c4\u03c9\u03c2 \u03c6\u03bf\u03b2\u03b5\u03c1\u03ad\u03c2. \u0395\u03c0\u03b9\u03b8\u03c5\u03bc\u03b5\u03af\u03c4\u03b5 \u03bc\u03af\u03b1 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03b5\u03c0\u03af \u03c4\u03bf\u03c5 \u03b8\u03ad\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03b9\u03c2 \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03b5\u03c2 \u03b7\u03bc\u03ad\u03c1\u03b5\u03c2, \u03ba\u03b1\u03c4\u03ac \u03c4\u03b7 \u03b4\u03b9\u03ac\u03c1\u03ba\u03b5\u03b9\u03b1 \u03c4\u03b7\u03c2 \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03c3\u03b1\u03c2 \u03c0\u03b5\u03c1\u03b9\u03cc\u03b4\u03bf\u03c5 \u03c3\u03c5\u03bd\u03cc\u03b4\u03bf\u03c5. 
\u0395\u03c0\u03af \u03c4\u03bf\u03c5 \u03c0\u03b1\u03c1\u03cc\u03bd\u03c4\u03bf\u03c2 \u03b8\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1, \u03cc\u03c0\u03c9\u03c2 \u03bc\u03bf\u03c5 \u03b6\u03ae\u03c4\u03b7\u03c3\u03b1\u03bd \u03bf\u03c1\u03b9\u03c3\u03bc\u03ad\u03bd\u03bf\u03b9 \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03bf\u03b9, \u03bd\u03b1 \u03c4\u03b7\u03c1\u03ae\u03c3\u03bf\u03c5\u03bc\u03b5 \u03b5\u03bd\u03cc\u03c2 \u03bb\u03b5\u03c0\u03c4\u03bf\u03cd \u03c3\u03b9\u03b3\u03ae \u03b3\u03b9\u03b1 \u03cc\u03bb\u03b1 \u03c4\u03b1 \u03b8\u03cd\u03bc\u03b1\u03c4\u03b1, \u03ba\u03c5\u03c1\u03af\u03c9\u03c2 \u03b3\u03b9\u03b1 \u03c4\u03b1 \u03b8\u03cd\u03bc\u03b1\u03c4\u03b1 \u03c4\u03c9\u03bd \u03ba\u03b1\u03c4\u03b1\u03b9\u03b3\u03af\u03b4\u03c9\u03bd, \u03c3\u03c4\u03b9\u03c2 \u03b4\u03b9\u03ac\u03c6\u03bf\u03c1\u03b5\u03c2 \u03c0\u03bb\u03b7\u03b3\u03b5\u03af\u03c3\u03b5\u03c2 \u03c7\u03ce\u03c1\u03b5\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03ae\u03c2 \u0388\u03bd\u03c9\u03c3\u03b7\u03c2. \u03a3\u03b1\u03c2 \u03ba\u03b1\u03bb\u03ce \u03bd\u03b1 \u03c3\u03b7\u03ba\u03c9\u03b8\u03b5\u03af\u03c4\u03b5 \u03b3\u03b9\u03b1 \u03b1\u03c5\u03c4\u03ae \u03c4\u03b7\u03bd \u03b5\u03bd\u03cc\u03c2 \u03bb\u03b5\u03c0\u03c4
 \u03bf\u03cd \u03c3\u03b9\u03b3\u03ae.
+(\u03a4\u03bf \u03a3\u03ce\u03bc\u03b1, \u03cc\u03c1\u03b8\u03b9\u03bf, \u03c4\u03b7\u03c1\u03b5\u03af \u03b5\u03bd\u03cc\u03c2 \u03bb\u03b5\u03c0\u03c4\u03bf\u03cd \u03c3\u03b9\u03b3\u03ae)
+
+K\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b5\u03c0\u03af \u03b5\u03bd\u03cc\u03c2 \u03b8\u03ad\u03bc\u03b1\u03c4\u03bf\u03c2 \u03b4\u03b9\u03b1\u03b4\u03b9\u03ba\u03b1\u03c3\u03af\u03b1\u03c2. \u0398\u03b1 \u03ad\u03c7\u03b5\u03c4\u03b5 \u03b5\u03bd\u03b7\u03bc\u03b5\u03c1\u03c9\u03b8\u03b5\u03af \u03b1\u03c0\u03cc \u03c4\u03bf\u03bd \u03c4\u03cd\u03c0\u03bf \u03ba\u03b1\u03b9 \u03c4\u03b7\u03bd \u03c4\u03b7\u03bb\u03b5\u03cc\u03c1\u03b1\u03c3\u03b7 \u03cc\u03c4\u03b9 \u03c3\u03c5\u03bd\u03ad\u03b2\u03b7\u03c3\u03b1\u03bd \u03bf\u03c1\u03b9\u03c3\u03bc\u03ad\u03bd\u03b5\u03c2 \u03b5\u03ba\u03c1\u03ae\u03be\u03b5\u03b9\u03c2 \u03b2\u03bf\u03bc\u03b2\u03ce\u03bd \u03ba\u03b1\u03b9 \u03c6\u03cc\u03bd\u03bf\u03b9 \u03c3\u03c4\u03b7 \u03a3\u03c1\u03b9 \u039b\u03ac\u03bd\u03ba\u03b1. \u0388\u03bd\u03b1\u03c2 \u03b1\u03c0\u03cc \u03c4\u03bf\u03c5\u03c2 \u03b1\u03bd\u03b8\u03c1\u03ce\u03c0\u03bf\u03c5\u03c2 \u03c0\u03bf\u03c5 \u03b4\u03bf\u03bb\u03bf\u03c6\u03bf\u03bd\u03ae\u03b8\u03b7\u03ba\u03b1\u03bd \u03c0\u03bf\u03bb\u03cd \u03c0\u03c1\u03cc\u03c3\u03c6\u03b1\u03c4\u03b1 \u03c3\u03c4\u03b7 \u03a3\u03c1\u03b9 \u039b\u03ac\u03bd\u03ba\u03b1 \u03ae\u03c4\u03b1\u03bd \u03bf \u03ba. Kumar Ponnambalam, \u03bf \u03bf\u03c0\u03bf\u03af\u03bf\u03c2 \u03b5\u03af\u03c7\u03b5 \u03b5\u03c0\u03b9\u03c3\u03ba\u03b5\u03c6\u03b8\u03b5\u03af \u03c4\u03bf \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03cc \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03bc\u03cc\u03bb\u03b9\u03c2 \u03c0\u03c1\u03b9\u03bd \u03bb\u03af\u03b3\u03bf\u03c5\u03c2 \u03bc\u03ae\u03bd\u03b5\u03c2. 
\u0398\u03b1 \u03c4\u03bf \u03b8\u03b5\u03c9\u03c1\u03bf\u03cd\u03c3\u03b1\u03c4\u03b5 \u03c3\u03c9\u03c3\u03c4\u03cc, \u03ba\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03bd\u03b1 \u03b3\u03c1\u03ac\u03c8\u03b5\u03c4\u03b5 \u03bc\u03af\u03b1 \u03b5\u03c0\u03b9\u03c3\u03c4\u03bf\u03bb\u03ae \u03c3\u03c4\u03b7\u03bd \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf \u03c4\u03b7\u03c2 \u03a3\u03c1\u03b9 \u039b\u03ac\u03bd\u03ba\u03b1, \u03cc\u03c0\u03bf\u03c5 \u03b8\u03b1 \u03b5\u03ba\u03c6\u03c1\u03ac\u03b6\u03b5\u03c4\u03b5 \u03c4\u03b7 \u03bb\u03cd\u03c0\u03b7 \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03b3\u03b9\u03b1 \u03c4\u03bf\u03bd \u03b8\u03ac\u03bd\u03b1\u03c4\u03cc \u03c4\u03bf\u03c5 \u03ba\u03b1\u03b9 \u03c4\u03bf\u03c5\u03c2 \u03ac\u03bb\u03bb\u03bf\u03c5\u03c2 \u03b2\u03af\u03b1\u03b9\u03bf\u03c5\u03c2 \u03b8\u03b1\u03bd\u03ac\u03c4\u03bf\u03c5\u03c2 \u03c3\u03c4\u03b7 \u03a3\u03c1\u03b9 \u039b\u03ac\u03bd\u03ba\u03b1 \u03ba\u03b1\u03b9 \u03b8\u03b1 \u03c4\u03b7\u03bd \u03c0\u03b1\u03c1\u03b1\u03ba\u03b9\u03bd\u03b5\u03af\u03c4
 \u03b5 \u03bd\u03b1 \u03c0\u03c1\u03ac\u03be\u03b5\u03b9 \u03bf\u03c4\u03b9\u03b4\u03ae\u03c0\u03bf\u03c4\u03b5 \u03b5\u03af\u03bd\u03b1\u03b9 \u03b4\u03c5\u03bd\u03b1\u03c4\u03cc, \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03c0\u03bb\u03b5\u03c5\u03c1\u03ac \u03c4\u03b7\u03c2, \u03b3\u03b9\u03b1 \u03bd\u03b1 \u03b1\u03bd\u03b1\u03b6\u03b7\u03c4\u03ae\u03c3\u03b5\u03b9 \u03bc\u03b9\u03b1 \u03b5\u03b9\u03c1\u03b7\u03bd\u03b9\u03ba\u03ae \u03b4\u03b9\u03b5\u03c5\u03b8\u03ad\u03c4\u03b7\u03c3\u03b7 \u03c3\u03b5 \u03bc\u03b9\u03b1 \u03c0\u03bf\u03bb\u03cd \u03b4\u03cd\u03c3\u03ba\u03bf\u03bb\u03b7 \u03ba\u03b1\u03c4\u03ac\u03c3\u03c4\u03b1\u03c3\u03b7;
+
+\u039d\u03b1\u03b9, \u03ba\u03cd\u03c1\u03b9\u03b5 Evans, \u03b8\u03b5\u03c9\u03c1\u03ce \u03cc\u03c4\u03b9 \u03bc\u03af\u03b1 \u03c0\u03c1\u03c9\u03c4\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03b1 \u03bc\u03b5 \u03c4\u03b7\u03bd \u03ad\u03bd\u03bd\u03bf\u03b9\u03b1 \u03c0\u03bf\u03c5 \u03bc\u03cc\u03bb\u03b9\u03c2 \u03c0\u03c1\u03bf\u03c4\u03b5\u03af\u03bd\u03b1\u03c4\u03b5 \u03b8\u03b1 \u03ae\u03c4\u03b1\u03bd \u03b1\u03c0\u03bf\u03bb\u03cd\u03c4\u03c9\u03c2 \u03b5\u03bd\u03b4\u03b5\u03b4\u03b5\u03b9\u03b3\u03bc\u03ad\u03bd\u03b7. \u0395\u03ac\u03bd \u03c4\u03bf \u03a3\u03ce\u03bc\u03b1 \u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03b5\u03af \u03b8\u03b1 \u03c0\u03c1\u03ac\u03be\u03c9 \u03cc\u03c0\u03c9\u03c2 \u03c0\u03c1\u03cc\u03c4\u03b5\u03b9\u03bd\u03b5 \u03bf \u03ba. Evans.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b5\u03c0\u03af \u03b5\u03bd\u03cc\u03c2 \u03b8\u03ad\u03bc\u03b1\u03c4\u03bf\u03c2 \u03b4\u03b9\u03b1\u03b4\u03b9\u03ba\u03b1\u03c3\u03af\u03b1\u03c2. \u0398\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03c4\u03b7 \u03c3\u03c5\u03bc\u03b2\u03bf\u03c5\u03bb\u03ae \u03c3\u03b1\u03c2 \u03b3\u03b9\u03b1 \u03c4\u03bf \u03ac\u03c1\u03b8\u03c1\u03bf 143 \u03c0\u03b5\u03c1\u03af \u03c4\u03bf\u03c5 \u03bc\u03b7 \u03c0\u03b1\u03c1\u03b1\u03b4\u03b5\u03ba\u03c4\u03bf\u03cd. \u03a4\u03bf \u03b5\u03c1\u03ce\u03c4\u03b7\u03bc\u03ac \u03bc\u03bf\u03c5 \u03b1\u03c6\u03bf\u03c1\u03ac \u03ba\u03ac\u03c4\u03b9 \u03c0\u03bf\u03c5 \u03b8\u03b1 \u03c4\u03b5\u03b8\u03b5\u03af \u03c0\u03c1\u03bf\u03c2 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03c4\u03b7\u03bd \u03a0\u03ad\u03bc\u03c0\u03c4\u03b7, \u03c4\u03bf \u03bf\u03c0\u03bf\u03af\u03bf \u03b8\u03b1 \u03b8\u03ad\u03c3\u03c9 \u03c4\u03cc\u03c4\u03b5 \u03be\u03b1\u03bd\u03ac.
+\u0397 \u03ad\u03ba\u03b8\u03b5\u03c3\u03b7 Cunha, \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03b1 \u03c0\u03bf\u03bb\u03c5\u03b5\u03c4\u03ae \u03c0\u03c1\u03bf\u03b3\u03c1\u03ac\u03bc\u03bc\u03b1\u03c4\u03b1 \u03c0\u03c1\u03bf\u03c3\u03b1\u03bd\u03b1\u03c4\u03bf\u03bb\u03b9\u03c3\u03bc\u03bf\u03cd, \u03b8\u03b1 \u03c4\u03b5\u03b8\u03b5\u03af \u03b5\u03bd\u03ce\u03c0\u03b9\u03bf\u03bd \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03c4\u03b7\u03bd \u03a0\u03ad\u03bc\u03c0\u03c4\u03b7 \u03ba\u03b1\u03b9 \u03c0\u03b5\u03c1\u03b9\u03ad\u03c7\u03b5\u03b9 \u03bc\u03af\u03b1 \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7 \u03c3\u03c4\u03b7\u03bd \u03c0\u03b1\u03c1\u03ac\u03b3\u03c1\u03b1\u03c6\u03bf 6, \u03cc\u03c4\u03b9 \u03c0\u03c1\u03ad\u03c0\u03b5\u03b9 \u03bd\u03b1 \u03b8\u03b5\u03c3\u03c0\u03b9\u03c3\u03b8\u03bf\u03cd\u03bd \u03ba\u03c5\u03c1\u03ce\u03c3\u03b5\u03b9\u03c2 \u03bc\u03b5 \u03c4\u03b7 \u03bc\u03bf\u03c1\u03c6\u03ae \u03c0\u03bf\u03c3\u03bf\u03c3\u03c4\u03ce\u03c3\u03b5\u03c9\u03bd \u03b3\u03b9\u03b1 \u03c4\u03b9\u03c2 \u03c7\u03ce\u03c1\u03b5\u03c2 \u03c0\u03bf\u03c5 \u03b1\u03c0\u03bf\u03c4\u03c5\u03b3\u03c7\u03ac\u03bd\u03bf\u03c5\u03bd \u03bd\u03b1 \u03b5\u03c0\u03b9\u03c4\u03cd\u03c7\u03bf\u03c5\u03bd \u03b5\u03c4\u03b7\u03c3\u03af\u03c9\u03c2 \u03c4\u03bf\u03c5\u03c2 \u03c3\u03c4\u03cc\u03c7\u03bf\u03c5\u03c2 \u03bc\u03b5\u03af\u03c9\u03c3\u03b7\u03c2 \u03c4\u03c9\u03bd \u03c3\u03c4\u03cc\u03bb\u03c9\u03bd \u03c4\u03bf\u03c5\u03c2. \u0391\u03bd\u03b1\u03c6\u03ad\u03c1\u03b5\u03b9 \u03cc\u03c4\u03b9 \u03b1\u03c5\u03c4\u03cc \u03c0\u03c1\u03ad\u03c0\u03b5\u03b9 \u03bd\u03b1 \u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03bf\u03c0\u03bf\u03b9\u03b7\u03b8\u03b5\u03af \u03c0\u03b1\u03c1\u03ac \u03c4\u03b7\u03bd \u03b1\u03c1\u03c7\u03ae \u03c4\u03b7\u03c2 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ae\u03c2 \u03c3\u03c4\u03b1\u03b8\u03b5\u03c1\u03cc\u03c4\u03b7\u03c4\u03b1\u03c2. 
\u0398\u03b5\u03c9\u03c1\u03ce \u03cc\u03c4\u03b9 \u03b7 \u03b1\u03c1\u03c7\u03ae \u03c4\u03b7\u03c2 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ae\u03c2 \u03c3\u03c4\u03b1\u03b8\u03b5\u03c1\u03cc\u03c4\u03b7\u03c4\u03b1\u03c2 \u03c3\u03c5\u03bd\u03b9\u03c3\u03c4\u03ac \u03b8\u03b5\u03bc\u03b5\u03bb\u03b9\u03ce\u03b4\u03b7 \u03bd\u03bf\u03bc\u03b9\u03ba\u03ae \u03b1\u03c1\u03c7\u03ae \u03c4\u03b7\u03c2 \u03ba\u03bf\u03b9\u03bd\u03ae\u03c2 \u03b1\u03bb\u03b9\u03b5\u03c5\u03c4\u03b9\u03ba\u03ae\u03c2 \u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae\u03c2 \u03ba\u03b1\u03b9 \u03bc\u03b9\u03b1 \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7 \u03b1\u03bd\u03b1\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03c4\u03b7\u03c2 \u03b8\u03b1 \u03ae\u03c4\u03b1\u03bd \u03bd\u03bf\u03bc\u03b9\u03ba\u03ac \u03b1\u03c0\u03b1\u03c1\u03ac\u03b4
 \u03ba\u03c4\u03b7. \u0398\u03ad\u03bb\u03c9 \u03bd\u03b1 \u03bc\u03ac\u03b8\u03c9 \u03b5\u03ac\u03bd \u03bc\u03c0\u03bf\u03c1\u03b5\u03af \u03ba\u03b1\u03bd\u03b5\u03af\u03c2 \u03bd\u03b1 \u03b8\u03ad\u03c3\u03b5\u03b9 \u03bc\u03af\u03b1 \u03ad\u03bd\u03c3\u03c4\u03b1\u03c3\u03b7 \u03c4\u03ad\u03c4\u03bf\u03b9\u03bf\u03c5 \u03b5\u03af\u03b4\u03bf\u03c5\u03c2 \u03c3\u03b5 \u03ad\u03bd\u03b1 \u03ad\u03b3\u03b3\u03c1\u03b1\u03c6\u03bf \u03c0\u03bf\u03c5 \u03b5\u03af\u03bd\u03b1\u03b9 \u03b1\u03c0\u03bb\u03ce\u03c2 \u03ad\u03ba\u03b8\u03b5\u03c3\u03b7, \u03ba\u03b1\u03b9 \u03cc\u03c7\u03b9 \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7 \u03bd\u03bf\u03bc\u03bf\u03b8\u03b5\u03c4\u03b9\u03ba\u03bf\u03cd \u03c0\u03b5\u03c1\u03b9\u03b5\u03c7\u03bf\u03bc\u03ad\u03bd\u03bf\u03c5, \u03ba\u03b1\u03b9 \u03b5\u03ac\u03bd \u03ad\u03c7\u03c9 \u03c4\u03b7\u03bd \u03b1\u03c1\u03bc\u03bf\u03b4\u03b9\u03cc\u03c4\u03b7\u03c4\u03b1 \u03bd\u03b1 \u03c4\u03bf \u03c0\u03c1\u03ac\u03be\u03c9 \u03b1\u03c5\u03c4\u03cc \u03c4\u03b7\u03bd \u03a0\u03ad\u03bc\u03c0\u03c4\u03b7.
+
+\u0391\u03ba\u03c1\u03b9\u03b2\u03ce\u03c2 \u03b5\u03ba\u03b5\u03af\u03bd\u03b7 \u03c4\u03b7 \u03c3\u03c4\u03b9\u03b3\u03bc\u03ae \u03b8\u03b1 \u03bc\u03c0\u03bf\u03c1\u03ad\u03c3\u03b5\u03c4\u03b5 \u03c0\u03c1\u03ac\u03b3\u03bc\u03b1\u03c4\u03b9, \u03b5\u03ac\u03bd \u03c4\u03bf \u03b5\u03c0\u03b9\u03b8\u03c5\u03bc\u03b5\u03af\u03c4\u03b5, \u03bd\u03b1 \u03b8\u03ad\u03c3\u03b5\u03c4\u03b5 \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u03b5\u03c1\u03ce\u03c4\u03b7\u03bc\u03b1, \u03b4\u03b7\u03bb\u03b1\u03b4\u03ae \u03c4\u03b7\u03bd \u03a0\u03ad\u03bc\u03c0\u03c4\u03b7 \u03c0\u03c1\u03b9\u03bd \u03c4\u03b7\u03bd \u03ad\u03bd\u03b1\u03c1\u03be\u03b7 \u03c4\u03b7\u03c2 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03af\u03b1\u03c3\u03b7\u03c2 \u03c4\u03b7\u03c2 \u03ad\u03ba\u03b8\u03b5\u03c3\u03b7\u03c2.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03c4\u03b1\u03c5\u03c4\u03cc\u03c7\u03c1\u03bf\u03bd\u03b1 \u03bc\u03b5 \u03c4\u03b7\u03bd \u03c0\u03c1\u03ce\u03c4\u03b7 \u03c0\u03b5\u03c1\u03af\u03bf\u03b4\u03bf \u03c3\u03c5\u03bd\u03cc\u03b4\u03bf\u03c5 \u03c4\u03bf\u03c5 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03bf\u03cd \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03b1\u03c5\u03c4\u03bf\u03cd \u03c4\u03bf\u03c5 \u03ad\u03c4\u03bf\u03c5\u03c2, \u03c3\u03c4\u03b9\u03c2 \u0397\u03bd\u03c9\u03bc\u03ad\u03bd\u03b5\u03c2 \u03a0\u03bf\u03bb\u03b9\u03c4\u03b5\u03af\u03b5\u03c2 \u03ba\u03b1\u03b9 \u03c3\u03c5\u03b3\u03ba\u03b5\u03ba\u03c1\u03b9\u03bc\u03ad\u03bd\u03b1 \u03c3\u03c4\u03bf \u03a4\u03ad\u03be\u03b1\u03c2, \u03bf\u03c1\u03af\u03c3\u03c4\u03b7\u03ba\u03b5 \u03b7 \u03b7\u03bc\u03b5\u03c1\u03bf\u03bc\u03b7\u03bd\u03af\u03b1, \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03b7 \u03a0\u03ad\u03bc\u03c0\u03c4\u03b7 \u03b4\u03c5\u03c3\u03c4\u03c5\u03c7\u03ce\u03c2, \u03c4\u03b7\u03c2 \u03b5\u03ba\u03c4\u03ad\u03bb\u03b5\u03c3\u03b7\u03c2 \u03b5\u03bd\u03cc\u03c2 \u03b8\u03b1\u03bd\u03b1\u03c4\u03bf\u03c0\u03bf\u03b9\u03bd\u03af\u03c4\u03b7, \u03b5\u03bd\u03cc\u03c2 \u03bd\u03ad\u03bf\u03c5 34 \u03b5\u03c4\u03ce\u03bd \u03bc\u03b5 \u03c4\u03bf \u03cc\u03bd\u03bf\u03bc\u03b1 Hicks.
+\u039a\u03b1\u03c4\u03cc\u03c0\u03b9\u03bd \u03c0\u03c1\u03c9\u03c4\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03b1\u03c2 \u03b5\u03bd\u03cc\u03c2 \u03b3\u03ac\u03bb\u03bb\u03bf\u03c5 \u03b2\u03bf\u03c5\u03bb\u03b5\u03c5\u03c4\u03ae, \u03c4\u03bf\u03c5 \u03ba. Zimeray, \u03ad\u03c7\u03b5\u03b9 \u03ae\u03b4\u03b7 \u03ba\u03b1\u03c4\u03b1\u03c4\u03b5\u03b8\u03b5\u03af \u03b1\u03af\u03c4\u03b7\u03c3\u03b7 \u03c0\u03bf\u03bb\u03bb\u03ce\u03bd \u03c3\u03c5\u03bd\u03c5\u03c0\u03bf\u03b3\u03c1\u03b1\u03c6\u03cc\u03bd\u03c4\u03c9\u03bd, \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03bb\u03b1\u03bc\u03b2\u03b1\u03bd\u03bf\u03bc\u03ad\u03bd\u03bf\u03c5 \u03c4\u03bf\u03c5 \u03bf\u03bc\u03b9\u03bb\u03bf\u03cd\u03bd\u03c4\u03bf\u03c2, \u03c3\u03b1\u03c2 \u03b6\u03b7\u03c4\u03ce \u03c9\u03c3\u03c4\u03cc\u03c3\u03bf, \u03c3\u03c4\u03bf \u03c0\u03bd\u03b5\u03cd\u03bc\u03b1 \u03c4\u03b7\u03c2 \u03ba\u03b1\u03c4\u03b5\u03cd\u03b8\u03c5\u03bd\u03c3\u03b7\u03c2 \u03c0\u03bf\u03c5 \u03c7\u03ac\u03c1\u03b1\u03be\u03b1\u03bd \u03c4\u03bf \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03cc \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03ba\u03b1\u03b9 \u03bf\u03bb\u03cc\u03ba\u03bb\u03b7\u03c1\u03b7 \u03b7 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03ae \u039a\u03bf\u03b9\u03bd\u03cc\u03c4\u03b7\u03c4\u03b1, \u03bd\u03b1 \u03c0\u03b1\u03c1\u03ad\u03bc\u03b2\u03b5\u03c4\u03b5, \u03bc\u03b5 \u03c4\u03bf \u03ba\u03cd\u03c1\u03bf\u03c2 \u03c4\u03bf\u03c5 \u03b1\u03be\u03b9\u03ce\u03bc\u03b1\u03c4\u03cc\u03c2 \u03c3\u03b1\u03c2 \u03ba\u03b1\u03b9 \u03c4\u03bf\u03c5 \u03b8\u03b5\u03c3\u03bc\u03b9\u03ba\u03bf\u03cd \u03bf\u03c1\u03b3\u03ac\u03bd\u03bf\u03c5 \u03c0\u03bf\u03c5 \u03b5\u03ba\u03c0\u03c1\u03bf\u03c3\u03c9\u03c0\u03b5\u03af\u03c4\u03b5, \u03c0\u03c1\u03bf\u03c2 \u03c4\u03bf\u03bd \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf \u03ba\u03b1\u03b9 \u03c4\u03bf\u03bd \u039a\u03c5\u03b2\u03b5\u03c1\u03bd\u03ae\u03c4\u03b7 \u03c4\u03bf\u03c5 
\u03a4\u03ad\u03be\u03b1\u03c2 Bush, \u03c0\u03bf\u03c5 \u03ad\u03c7\u03b5\u03b9 \u03c4\u03b7\u03bd \u03b5\u03be\u03bf\u03c5\u03c3\u03af\u03b1 \u03bd\u03b1 \u03b1\u03bd\u03b1\u03c3\u03c4\u03b5\u03af\u03bb\u03b5\u03b9 \u03c4\u03b7 \u03b8\u03b1\u03bd\u03b1\u03c4\u03b9\u03ba\u03ae \u03ba\u03b1\u03c4\u03b1\u03b4\u03af\u03ba\u03b7 \u03ba\u03b1\u03b9 \u03bd\u03b1 \u03b1\u03c0\u03bf\u03bd\u03b5\u03af\u03bc\u03b5\u03b9 \u03c7\u03ac\u03c1\u03b7 \u03c3\u03c4\u03bf\u03bd \u03ba\u03b1\u03c4\u03ac\u03b4\u03b9\u03ba\u03bf.
+\u0397 \u03c3\u03c4\u03ac\u03c3\u03b7 \u03b1\u03c5\u03c4\u03ae \u03b5\u03af\u03bd\u03b1\u03b9 \u03c3\u03c5\u03bd\u03b5\u03c0\u03ae\u03c2 \u03bc\u03b5 \u03c4\u03b9\u03c2 \u03b1\u03c1\u03c7\u03ad\u03c2 \u03c0\u03bf\u03c5 \u03c0\u03ac\u03bd\u03c4\u03b1 \u03c0\u03c1\u03bf\u03c3\u03c0\u03af\u03b6\u03b1\u03bc\u03b5.
+
+\u0395\u03c5\u03c7\u03b1\u03c1\u03b9\u03c3\u03c4\u03ce \u03ba\u03cd\u03c1\u03b9\u03b5 Segni, \u03b8\u03b1 \u03c4\u03bf \u03c0\u03c1\u03ac\u03be\u03c9 \u03c0\u03bf\u03bb\u03cd \u03b5\u03c5\u03c7\u03b1\u03c1\u03af\u03c3\u03c4\u03c9\u03c2. \u03a0\u03c1\u03ac\u03b3\u03bc\u03b1\u03c4\u03b9, \u03b5\u03c5\u03b8\u03c5\u03b3\u03c1\u03b1\u03bc\u03bc\u03af\u03b6\u03b5\u03c4\u03b1\u03b9 \u03b1\u03c0\u03bf\u03bb\u03cd\u03c4\u03c9\u03c2 \u03bc\u03b5 \u03c4\u03b9\u03c2 \u03b8\u03ad\u03c3\u03b5\u03b9\u03c2 \u03c0\u03bf\u03c5 \u03b1\u03bd\u03ad\u03ba\u03b1\u03b8\u03b5\u03bd \u03c5\u03b9\u03bf\u03b8\u03b5\u03c4\u03bf\u03cd\u03c3\u03b5 \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03cc \u03bc\u03b1\u03c2.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b8\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03bd\u03b1 \u03b5\u03c0\u03b9\u03c3\u03c4\u03ae\u03c3\u03c9 \u03c4\u03b7\u03bd \u03c0\u03c1\u03bf\u03c3\u03bf\u03c7\u03ae \u03c3\u03b1\u03c2 \u03c3\u03b5 \u03bc\u03af\u03b1 \u03c0\u03b5\u03c1\u03af\u03c0\u03c4\u03c9\u03c3\u03b7 \u03bc\u03b5 \u03c4\u03b7\u03bd \u03bf\u03c0\u03bf\u03af\u03b1 \u03ad\u03c7\u03b5\u03b9 \u03b1\u03c3\u03c7\u03bf\u03bb\u03b7\u03b8\u03b5\u03af \u03b5\u03c0\u03b1\u03bd\u03b5\u03b9\u03bb\u03b7\u03bc\u03bc\u03ad\u03bd\u03b1 \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf. \u0395\u03bd\u03bd\u03bf\u03ce \u03c4\u03b7\u03bd \u03c0\u03b5\u03c1\u03af\u03c0\u03c4\u03c9\u03c3\u03b7 \u03c4\u03bf\u03c5 Alexander Nikitin. \u038c\u03bb\u03bf\u03b9 \u03b5\u03b4\u03ce \u03c7\u03b1\u03b9\u03c1\u03cc\u03bc\u03b1\u03c3\u03c4\u03b5 \u03c0\u03bf\u03c5 \u03c4\u03bf\u03bd \u03b1\u03b8\u03ce\u03c9\u03c3\u03b5 \u03c4\u03bf \u03b4\u03b9\u03ba\u03b1\u03c3\u03c4\u03ae\u03c1\u03b9\u03bf, \u03ba\u03b1\u03b8\u03b9\u03c3\u03c4\u03ce\u03bd\u03c4\u03b1\u03c2 \u03c3\u03b1\u03c6\u03ad\u03c2 \u03cc\u03c4\u03b9 \u03b7 \u03c0\u03c1\u03cc\u03c3\u03b2\u03b1\u03c3\u03b7 \u03c3\u03b5 \u03c0\u03bb\u03b7\u03c1\u03bf\u03c6\u03bf\u03c1\u03af\u03b5\u03c2 \u03c0\u03bf\u03c5 \u03b1\u03c6\u03bf\u03c1\u03bf\u03cd\u03bd \u03c4\u03bf \u03c0\u03b5\u03c1\u03b9\u03b2\u03ac\u03bb\u03bb\u03bf\u03bd \u03b1\u03c0\u03bf\u03c4\u03b5\u03bb\u03b5\u03af \u03c3\u03c5\u03bd\u03c4\u03b1\u03b3\u03bc\u03b1\u03c4\u03b9\u03ba\u03cc \u03b4\u03b9\u03ba\u03b1\u03af\u03c9\u03bc\u03b1 \u03ba\u03b1\u03b9 \u03c3\u03c4\u03b7 \u03a1\u03c9\u03c3\u03af\u03b1. 
\u03a4\u03ce\u03c1\u03b1 \u03cc\u03bc\u03c9\u03c2 \u03bc\u03b1\u03b8\u03b1\u03af\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c0\u03c9\u03c2 \u03bf Alexander Nikitin \u03c0\u03c1\u03cc\u03ba\u03b5\u03b9\u03c4\u03b1\u03b9 \u03bd\u03b1 \u03ba\u03b1\u03c4\u03b7\u03b3\u03bf\u03c1\u03b7\u03b8\u03b5\u03af \u03b5\u03ba \u03bd\u03ad\u03bf\u03c5, \u03b1\u03c6\u03bf\u03cd \u03b8\u03b1 \u03b1\u03c3\u03ba\u03ae\u03c3\u03b5\u03b9 \u03ad\u03c6\u03b5\u03c3\u03b7 \u03b7 \u03b5\u03b9\u03c3\u03b1\u03b3\u03b3\u03b5\u03bb\u03af\u03b1. \u0393\u03bd\u03c9\u03c1\u03af\u03b6\u03bf\u03c5\u03bc\u03b5, \u03ba\u03b1\u03b9 \u03c4\u03bf \u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03b4\u03b9\u03b1\u03c0\u03b9\u03c3\u03c4\u03ce\u03c3\u03b5\u03b9 \u03c3\u03b5 \u03c0\u03ac\u03c1\u03b1 \u03c0\u03bf\u03bb\u03bb\u03ac, \u03cc\u03bd\u03c4\u03c9\u03c2, \u03c8\u03b7\u03c6\u03af\u03c3\u03bc\u03b1\u03c4\u03b1 - \u03b5\u03b9\u03b4\u03b9\u03ba\u03ac \u03ba\u03b1\u03c4\u03ac \u03c4\u03b7 \u03b4\u03b9\u03ac\u03c1\u03ba\u03b5\u03b9\u03b1 \u03c4\u03b7\u03c2 \u03c4\u03b5\u03bb
 \u03b5\u03c5\u03c4\u03b1\u03af\u03b1\u03c2 \u03c0\u03b5\u03c1\u03b9\u03cc\u03b4\u03bf\u03c5 \u03c3\u03c5\u03bd\u03cc\u03b4\u03bf\u03c5 \u03c4\u03bf\u03c5 \u03c0\u03c1\u03bf\u03b7\u03b3\u03bf\u03cd\u03bc\u03b5\u03bd\u03bf\u03c5 \u03ad\u03c4\u03bf\u03c5\u03c2 - \u03cc\u03c4\u03b9 \u03b1\u03c5\u03c4\u03cc \u03b4\u03b5\u03bd \u03b1\u03c0\u03bf\u03c4\u03b5\u03bb\u03b5\u03af \u03bd\u03bf\u03bc\u03b9\u03ba\u03ae \u03bc\u03cc\u03bd\u03bf\u03bd \u03c5\u03c0\u03cc\u03b8\u03b5\u03c3\u03b7 \u03b1\u03bb\u03bb\u03ac \u03ba\u03b1\u03b9 \u03cc\u03c4\u03b9 \u03b5\u03af\u03bd\u03b1\u03b9 \u03bb\u03ac\u03b8\u03bf\u03c2 \u03bd\u03b1 \u03ba\u03b1\u03c4\u03b7\u03b3\u03bf\u03c1\u03b5\u03af\u03c4\u03b1\u03b9 \u03bf Alexander Nikitin \u03b3\u03b9\u03b1 \u03b5\u03b3\u03ba\u03bb\u03b7\u03bc\u03b1\u03c4\u03b9\u03ba\u03ae \u03b4\u03c1\u03ac\u03c3\u03b7 \u03ba\u03b1\u03b9 \u03c0\u03c1\u03bf\u03b4\u03bf\u03c3\u03af\u03b1, \u03b5\u03c0\u03b5\u03b9\u03b4\u03ae \u03b5\u03bc\u03b5\u03af\u03c2 \u03b5\u03c0\u03c9\u03c6\u03b5\u03bb\u03bf\u03cd\u03bc\u03b5\u03b8\u03b1 \u03c9\u03c2 \u03b8\u03b9\u03b3\u03cc\u03bc\u03b5\u03bd\u03bf\u03b9 \u03b1\u03c0\u03cc \u03c4\u03b1 \u03c0\u03bf\u03c1\u03af\u03c3\u03bc\u03b1\u03c4\u03ac \u03c4\u03bf\u03c5. 
\u03a4\u03b1 \u03b5\u03bd \u03bb\u03cc\u03b3\u03c9 \u03c0\u03bf\u03c1\u03af\u03c3\u03bc\u03b1\u03c4\u03b1 \u03c3\u03c5\u03bd\u03b9\u03c3\u03c4\u03bf\u03cd\u03bd \u03c4\u03b7 \u03b2\u03ac\u03c3\u03b7 \u03c4\u03c9\u03bd \u03b5\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03ce\u03bd \u03c0\u03c1\u03bf\u03b3\u03c1\u03b1\u03bc\u03bc\u03ac\u03c4\u03c9\u03bd \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03c0\u03c1\u03bf\u03c3\u03c4\u03b1\u03c3\u03af\u03b1 \u03c4\u03b7\u03c2 \u0398\u03ac\u03bb\u03b1\u03c3\u03c3\u03b1\u03c2 \u03c4\u03bf\u03c5 \u039c\u03c0\u03ac\u03c1\u03b5\u03bd\u03c4, \u03ba\u03b1\u03b9 \u03b3\u03b9\u03b1 \u03c4\u03bf\u03bd \u03bb\u03cc\u03b3\u03bf \u03b1\u03c5\u03c4\u03cc \u03c3\u03b1\u03c2 \u03c0\u03b1\u03c1\u03b1\u03ba\u03b1\u03bb\u03ce \u03bd\u03b1 \u03bc\u03b5\u03bb\u03b5\u03c4\u03ae\u03c3\u03b5\u03c4\u03b5 \u03ad\u03bd\u03b1 \u03c3\u03c7\u03ad\u03b4\u03b9\u03bf \u03b5\u03c0\u03b9\u03c3\u03c4\u03bf\u03bb\u03ae\u03c2 \u03c0\u03bf\u03c5 \u03b8\u03b1 \u03c3\u03b1\u03c2 \u03c0\u03b5\u03c1\u03b9\u03b3\u03c1\u03ac\u03c8\u03b5\u03b9 \u03c4\u03b1 \u03c3\u03b7\u03bc\u03b1\u03bd\u03c4\u03b9\u03ba\u03cc\u03c4\u03b5\u03c1\u03b1 \u03b3\u03b5\u03b3\u03bf\u03bd\u03cc\u03c4\u03b1 \u03ba\u03b1\u03b9 \u03bd\u03b1 \u03ba\u03b1\u03c4\u03b1\u03c3\u03c4\u03ae\u03c3\u03b5\u03c4\u03b5 \u03c3\u03b1\u03c6\u03ae \u03c4\u03b7 \u03b8\u03ad\u03c3\u03b7 \u03b1\u03c5\u03c4\u03ae \u03c3\u03c4\u03b7 \u03a1\u03c9\u03c3\u03af\u03b1 \u03c3\u03c4\u03bf \u03c0\u03bd\u03b5\u03cd\u03bc\u03b1 \u03c4\u03c9\u03bd \u03b1\u03c0\u03bf\u03c6\u03ac\u03c3\u03b5\u03c9\u03bd 
 \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5
+
+\u0392\u03b5\u03b2\u03b1\u03af\u03c9\u03c2, \u03ba\u03c5\u03c1\u03af\u03b1 Schroedter, \u03b8\u03b1 \u03b5\u03be\u03b5\u03c4\u03ac\u03c3\u03c9 \u03c0\u03bf\u03bb\u03cd \u03b5\u03c5\u03c7\u03b1\u03c1\u03af\u03c3\u03c4\u03c9\u03c2 \u03c4\u03b1 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03bf \u03c3\u03c5\u03b3\u03ba\u03b5\u03ba\u03c1\u03b9\u03bc\u03ad\u03bd\u03bf \u03b6\u03ae\u03c4\u03b7\u03bc\u03b1 \u03b3\u03b5\u03b3\u03bf\u03bd\u03cc\u03c4\u03b1 \u03bc\u03cc\u03bb\u03b9\u03c2 \u03bb\u03ac\u03b2\u03c9 \u03c4\u03b7\u03bd \u03b5\u03c0\u03b9\u03c3\u03c4\u03bf\u03bb\u03ae \u03c3\u03b1\u03c2.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b8\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03ba\u03b1\u03c4\u03b1\u03c1\u03c7\u03ac\u03c2 \u03bd\u03b1 \u03c3\u03b1\u03c2 \u03c3\u03c5\u03b3\u03c7\u03b1\u03c1\u03ce \u03c0\u03bf\u03c5 \u03ba\u03c1\u03b1\u03c4\u03ae\u03c3\u03b1\u03c4\u03b5 \u03c4\u03bf \u03bb\u03cc\u03b3\u03bf \u03c3\u03b1\u03c2, \u03b3\u03b9\u03b1\u03c4\u03af \u03b4\u03b9\u03b1\u03c0\u03b9\u03c3\u03c4\u03ce\u03bd\u03c9 \u03cc\u03bd\u03c4\u03c9\u03c2 \u03c3\u03b5 \u03b1\u03c5\u03c4\u03ae\u03bd \u03c4\u03b7\u03bd \u03c0\u03c1\u03ce\u03c4\u03b7 \u03c0\u03b5\u03c1\u03af\u03bf\u03b4\u03bf \u03c3\u03c5\u03bd\u03cc\u03b4\u03bf\u03c5 \u03c4\u03bf\u03c5 \u03ba\u03b1\u03b9\u03bd\u03bf\u03cd\u03c1\u03b3\u03b9\u03bf\u03c5 \u03c7\u03c1\u03cc\u03bd\u03bf\u03c5 \u03cc\u03c4\u03b9 \u03bf \u03b1\u03c1\u03b9\u03b8\u03bc\u03cc\u03c2 \u03c4\u03c9\u03bd \u03c4\u03b7\u03bb\u03b5\u03bf\u03c0\u03c4\u03b9\u03ba\u03ce\u03bd \u03c3\u03c4\u03b1\u03b8\u03bc\u03ce\u03bd \u03c0\u03bf\u03c5 \u03c0\u03b9\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c3\u03c4\u03bf \u03a3\u03ce\u03bc\u03b1 \u03b1\u03c5\u03be\u03ae\u03b8\u03b7\u03ba\u03b5 \u03b4\u03c1\u03b1\u03bc\u03b1\u03c4\u03b9\u03ba\u03ac. \u038c\u03bc\u03c9\u03c2, \u03ba\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b4\u03b5\u03bd \u03ad\u03b3\u03b9\u03bd\u03b5 \u03b1\u03c5\u03c4\u03cc \u03c0\u03bf\u03c5 \u03b6\u03ae\u03c4\u03b7\u03c3\u03b1. 
\u0393\u03b9\u03b1\u03c4\u03af \u03c4\u03ce\u03c1\u03b1 \u03c0\u03b9\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 \u03b4\u03cd\u03bf \u03c6\u03b9\u03bd\u03bb\u03b1\u03bd\u03b4\u03b9\u03ba\u03bf\u03cd\u03c2 \u03c3\u03c4\u03b1\u03b8\u03bc\u03bf\u03cd\u03c2 \u03ba\u03b1\u03b9 \u03ad\u03bd\u03b1\u03bd \u03c0\u03bf\u03c1\u03c4\u03bf\u03b3\u03b1\u03bb\u03b9\u03ba\u03cc \u03b1\u03bb\u03bb\u03ac \u03ba\u03b1\u03bd\u03ad\u03bd\u03b1\u03bd \u03bf\u03bb\u03bb\u03b1\u03bd\u03b4\u03b9\u03ba\u03cc, \u03b5\u03bd\u03ce \u03b1\u03c5\u03c4\u03cc \u03c3\u03b1\u03c2 \u03b5\u03af\u03c7\u03b1 \u03b6\u03b7\u03c4\u03ae\u03c3\u03b5\u03b9, \u03b5\u03c0\u03b5\u03b9\u03b4\u03ae \u03ba\u03b1\u03b9 \u03b5\u03bc\u03b5\u03af\u03c2 \u03bf\u03b9 \u039f\u03bb\u03bb\u03b1\u03bd\u03b4\u03bf\u03af \u03b8\u03ad\u03bb\u03bf\u03c5\u03bc\u03b5 \u03bd\u03b1 \u03c0\u03b1\u03c1\u03b1\u03ba\u03bf\u03bb\u03bf\u03c5\u03b8\u03bf\u03cd\u03bc\u03b5 \u03c4\u03b1 \u03bd\u03ad\u03b1 \u03ba\u03ac\u03b8\u03b5 \u03bc\u03ae\u03bd\u03b1 \u03ba\u03ac\u03b8\u03b5 \u03c6\u03bf\u03c1\u03ac \u03c0\u03bf\u03c5 \u03bc\u03b1\u03c2 \u03c3\u03c4\u03ad\u03bb\u03bd\u03bf\u03c5\u03bd \u03b5\u03be\u03bf\u03c1\u03af\u03b1 \u03b5\u03b4\u03ce. \u0398\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03bb\u03bf\u03b9\u03c0\u03cc\u03bd \u03bd\u03b1 \u03c3\u03b1\u03c2 \u03c0\u03b1\u03c1\u03b1\u03ba\u03b1\u03bb\u03ad\u03c3\u03c9
  \u03b3\u03b9\u03b1 \u03ac\u03bb\u03bb\u03b7 \u03bc\u03af\u03b1 \u03c6\u03bf\u03c1\u03ac \u03bd\u03b1 \u03c6\u03c1\u03bf\u03bd\u03c4\u03af\u03c3\u03b5\u03c4\u03b5, \u03ce\u03c3\u03c4\u03b5 \u03bd\u03b1 \u03bc\u03c0\u03bf\u03c1\u03bf\u03cd\u03bc\u03b5 \u03bd\u03b1 \u03c0\u03b9\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 \u03ba\u03b1\u03b9 \u03ad\u03bd\u03b1\u03bd \u03bf\u03bb\u03bb\u03b1\u03bd\u03b4\u03b9\u03ba\u03cc \u03c3\u03c4\u03b1\u03b8\u03bc\u03cc.
+
+\u039a\u03c5\u03c1\u03af\u03b1 Plooij-van Gorsel, \u03bc\u03c0\u03bf\u03c1\u03ce \u03bd\u03b1 \u03c3\u03b1\u03c2 \u03c0\u03c9 \u03cc\u03c4\u03b9 \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u03b6\u03ae\u03c4\u03b7\u03bc\u03b1 \u03b2\u03c1\u03af\u03c3\u03ba\u03b5\u03c4\u03b1\u03b9 \u03c3\u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7 \u03c4\u03b7\u03c2 \u03c3\u03c5\u03bd\u03b5\u03b4\u03c1\u03af\u03b1\u03c3\u03b7\u03c2 \u03c4\u03c9\u03bd \u039a\u03bf\u03c3\u03bc\u03b7\u03c4\u03cc\u03c1\u03c9\u03bd \u03c0\u03bf\u03c5 \u03b8\u03b1 \u03b4\u03b9\u03b5\u03be\u03b1\u03c7\u03b8\u03b5\u03af \u03c4\u03b7\u03bd \u03a4\u03b5\u03c4\u03ac\u03c1\u03c4\u03b7. \u0395\u03bb\u03c0\u03af\u03b6\u03c9 \u03cc\u03c4\u03b9 \u03b8\u03b1 \u03b5\u03be\u03b5\u03c4\u03b1\u03c3\u03b8\u03b5\u03af \u03bc\u03b5 \u03b8\u03b5\u03c4\u03b9\u03ba\u03cc \u03c0\u03bd\u03b5\u03cd\u03bc\u03b1.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03bc\u03c0\u03bf\u03c1\u03b5\u03af\u03c4\u03b5 \u03bd\u03b1 \u03bc\u03bf\u03c5 \u03c0\u03b5\u03af\u03c4\u03b5 \u03b3\u03b9\u03b1\u03c4\u03af \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03b4\u03b5\u03bd \u03b5\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03b5\u03b9 \u03c4\u03b7 \u03bd\u03bf\u03bc\u03bf\u03b8\u03b5\u03c3\u03af\u03b1 \u03c0\u03b5\u03c1\u03af \u03c5\u03b3\u03b9\u03b5\u03b9\u03bd\u03ae\u03c2 \u03ba\u03b1\u03b9 \u03b1\u03c3\u03c6\u03ac\u03bb\u03b5\u03b9\u03b1\u03c2 \u03c4\u03b7\u03bd \u03bf\u03c0\u03bf\u03af\u03b1 \u03c3\u03c4\u03b7\u03bd \u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03b9\u03ba\u03cc\u03c4\u03b7\u03c4\u03b1 \u03c8\u03b7\u03c6\u03af\u03b6\u03b5\u03b9; \u0393\u03b9\u03b1\u03c4\u03af \u03b4\u03b5\u03bd \u03ad\u03c7\u03b5\u03b9 \u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03bf\u03c0\u03bf\u03b9\u03b7\u03b8\u03b5\u03af \u03ad\u03bb\u03b5\u03b3\u03c7\u03bf\u03c2 \u03c4\u03b7\u03c2 \u03c0\u03bf\u03b9\u03cc\u03c4\u03b7\u03c4\u03b1\u03c2 \u03c4\u03bf\u03c5 \u03b1\u03ad\u03c1\u03b1 \u03c3\u03c4\u03bf \u03c3\u03c5\u03b3\u03ba\u03b5\u03ba\u03c1\u03b9\u03bc\u03ad\u03bd\u03bf \u03ba\u03c4\u03af\u03c1\u03b9\u03bf \u03b1\u03c0\u03cc \u03c4\u03cc\u03c4\u03b5 \u03c0\u03bf\u03c5 \u03b5\u03ba\u03bb\u03b5\u03b3\u03ae\u03ba\u03b1\u03bc\u03b5; \u0393\u03b9\u03b1\u03c4\u03af \u03b4\u03b5\u03bd \u03ad\u03c7\u03b5\u03b9 \u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03bf\u03c0\u03bf\u03b9\u03b7\u03b8\u03b5\u03af \u03ba\u03b1\u03bc\u03af\u03b1 \u03c3\u03c5\u03bd\u03ad\u03bb\u03b5\u03c5\u03c3\u03b7 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03a5\u03b3\u03b5\u03af\u03b1\u03c2 \u03ba\u03b1\u03b9 \u0391\u03c3\u03c6\u03ac\u03bb\u03b5\u03b9\u03b1\u03c2 \u03b1\u03c0\u03cc \u03c4\u03bf 1998; \u0393\u03b9\u03b1\u03c4\u03af \u03b4\u03b5\u03bd \u03ad\u03c7\u03b5\u03b9 
\u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03bf\u03c0\u03bf\u03b9\u03b7\u03b8\u03b5\u03af \u03ba\u03b1\u03bc\u03af\u03b1 \u03ac\u03c3\u03ba\u03b7\u03c3\u03b7 \u03c0\u03c5\u03c1\u03ba\u03b1\u03b3\u03b9\u03ac\u03c2, \u03bf\u03cd\u03c4\u03b5 \u03c3\u03c4\u03b1 \u03ba\u03c4\u03af\u03c1\u03b9\u03b1 \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03c3\u03c4\u03b9\u03c2 \u0392\u03c1\u03c5\u03be\u03ad\u03bb\u03bb\u03b5\u03c2 \u03bf\u03cd\u03c4\u03b5 \u03c3\u03c4\u03b1 \u03ba\u03c4\u03af\u03c1\u03b9\u03b1 \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03c3\u03c4\u03bf \u03a3\u03c4\u03c1\u03b1\u03c3\u03b2\u03bf\u03cd\u03c1\u03b3\u03bf; \u0393\u03b9\u03b1\u03c4\u03af \u03b4\u03b5\u03bd \u03c5\u03c0\u03ac\u03c1\u03c7\u03bf\u03c5\u03bd \u03bf\u03b4\u03b7\u03b3\u03af\u03b5\u03c2
  \u03b3\u03b9\u03b1 \u03c0\u03b5\u03c1\u03af\u03c0\u03c4\u03c9\u03c3\u03b7 \u03c0\u03c5\u03c1\u03ba\u03b1\u03b3\u03b9\u03ac\u03c2; \u0393\u03b9\u03b1\u03c4\u03af \u03bf\u03b9 \u03c3\u03ba\u03ac\u03bb\u03b5\u03c2 \u03b4\u03b5\u03bd \u03b2\u03b5\u03bb\u03c4\u03b9\u03ce\u03b8\u03b7\u03ba\u03b1\u03bd \u03b1\u03c0\u03cc \u03c4\u03cc\u03c4\u03b5 \u03c0\u03bf\u03c5 \u03ad\u03c0\u03b1\u03b8\u03b1 \u03c4\u03bf \u03b1\u03c4\u03cd\u03c7\u03b7\u03bc\u03ac \u03bc\u03bf\u03c5; \u0393\u03b9\u03b1\u03c4\u03af \u03b4\u03b5\u03bd \u03b5\u03c0\u03b9\u03b2\u03ac\u03bb\u03bb\u03b5\u03c4\u03b1\u03b9 \u03b7 \u03c4\u03ae\u03c1\u03b7\u03c3\u03b7 \u03c4\u03c9\u03bd \u03c7\u03ce\u03c1\u03c9\u03bd \u03bc\u03b7 \u03ba\u03b1\u03c0\u03bd\u03b9\u03c3\u03c4\u03ce\u03bd; \u03a6\u03b1\u03af\u03bd\u03b5\u03c4\u03b1\u03b9 \u03b5\u03bd\u03c4\u03b5\u03bb\u03ce\u03c2 \u03b5\u03c0\u03b1\u03af\u03c3\u03c7\u03c5\u03bd\u03c4\u03bf \u03bd\u03b1 \u03c8\u03b7\u03c6\u03af\u03b6\u03bf\u03c5\u03bc\u03b5 \u03bd\u03bf\u03bc\u03bf\u03b8\u03b5\u03c3\u03af\u03b1 \u03ba\u03b1\u03b9 \u03bd\u03b1 \u03bc\u03b7\u03bd \u03c4\u03b7\u03bd \u03b5\u03c6\u03b1\u03c1\u03bc\u03cc\u03b6\u03bf\u03c5\u03bc\u03b5 \u03b5\u03bc\u03b5\u03af\u03c2 \u03bf\u03b9 \u03af\u03b4\u03b9\u03bf\u03b9.
+(\u03a7\u03b5\u03b9\u03c1\u03bf\u03ba\u03c1\u03bf\u03c4\u03ae\u03bc\u03b1\u03c4\u03b1)
+
+\u039a\u03c5\u03c1\u03af\u03b1 Lynne, \u03ad\u03c7\u03b5\u03c4\u03b5 \u03b1\u03c0\u03cc\u03bb\u03c5\u03c4\u03bf \u03b4\u03af\u03ba\u03b9\u03bf \u03ba\u03b1\u03b9 \u03b8\u03b1 \u03b5\u03be\u03b1\u03ba\u03c1\u03b9\u03b2\u03ce\u03c3\u03c9 \u03b5\u03ac\u03bd \u03cc\u03bd\u03c4\u03c9\u03c2 \u03cc\u03bb\u03b1 \u03b1\u03c5\u03c4\u03ac \u03b4\u03b5\u03bd \u03ad\u03c7\u03bf\u03c5\u03bd \u03b3\u03af\u03bd\u03b5\u03b9. \u0398\u03b1 \u03b8\u03ad\u03c3\u03c9 \u03b5\u03c0\u03af\u03c3\u03b7\u03c2 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b2\u03bb\u03b7\u03bc\u03b1 \u03c3\u03c4\u03bf \u03a3\u03ce\u03bc\u03b1 \u03c4\u03c9\u03bd \u039a\u03bf\u03c3\u03bc\u03b7\u03c4\u03cc\u03c1\u03c9\u03bd \u03ba\u03b1\u03b9 \u03b5\u03af\u03bc\u03b1\u03b9 \u03b2\u03ad\u03b2\u03b1\u03b9\u03b7 \u03cc\u03c4\u03b9 \u03bf\u03b9 \u039a\u03bf\u03c3\u03bc\u03ae\u03c4\u03bf\u03c1\u03ad\u03c2 \u03bc\u03b1\u03c2 \u03b8\u03b1 \u03b5\u03c0\u03b9\u03b4\u03b5\u03af\u03be\u03bf\u03c5\u03bd \u03b6\u03ae\u03bb\u03bf \u03ba\u03b1\u03b9 \u03b8\u03b1 \u03b5\u03bd\u03b5\u03c1\u03b3\u03ae\u03c3\u03bf\u03c5\u03bd \u03ad\u03c4\u03c3\u03b9, \u03ce\u03c3\u03c4\u03b5 \u03bd\u03b1 \u03c3\u03b5\u03b2\u03cc\u03bc\u03b1\u03c3\u03c4\u03b5 \u03c4\u03bf\u03c5\u03c2 \u03ba\u03b1\u03bd\u03bf\u03bd\u03b9\u03c3\u03bc\u03bf\u03cd\u03c2 \u03c0\u03bf\u03c5 \u03c3\u03c4\u03b7\u03bd \u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03b9\u03ba\u03cc\u03c4\u03b7\u03c4\u03b1 \u03c8\u03b7\u03c6\u03af\u03b6\u03bf\u03c5\u03bc\u03b5 \u03b5\u03bc\u03b5\u03af\u03c2.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b7 \u03ba. D\u03bdez Gonz\u03b1lez \u03ba\u03b1\u03b9 \u03b5\u03b3\u03ce \u03bf \u03af\u03b4\u03b9\u03bf\u03c2 \u03b5\u03af\u03c7\u03b1\u03bc\u03b5 \u03c5\u03c0\u03bf\u03b2\u03ac\u03bb\u03b5\u03b9 \u03ba\u03ac\u03c0\u03bf\u03b9\u03b5\u03c2 \u03b5\u03c1\u03c9\u03c4\u03ae\u03c3\u03b5\u03b9\u03c2 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c3\u03c5\u03b3\u03ba\u03b5\u03ba\u03c1\u03b9\u03bc\u03ad\u03bd\u03b5\u03c2 \u03b1\u03c0\u03cc\u03c8\u03b5\u03b9\u03c2 \u03c4\u03b7\u03c2 \u0391\u03bd\u03c4\u03b9\u03c0\u03c1\u03bf\u03ad\u03b4\u03c1\u03bf\u03c5 \u03ba. de Palacio, \u03bf\u03b9 \u03bf\u03c0\u03bf\u03af\u03b5\u03c2 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03b9\u03ac\u03c3\u03c4\u03b7\u03ba\u03b1\u03bd \u03c3\u03b5 \u03b9\u03c3\u03c0\u03b1\u03bd\u03b9\u03ba\u03ae \u03b5\u03c6\u03b7\u03bc\u03b5\u03c1\u03af\u03b4\u03b1. \u039f\u03b9 \u03b1\u03c1\u03bc\u03cc\u03b4\u03b9\u03b5\u03c2 \u03c5\u03c0\u03b7\u03c1\u03b5\u03c3\u03af\u03b5\u03c2 \u03b4\u03b5\u03bd \u03c4\u03b9\u03c2 \u03ad\u03c7\u03bf\u03c5\u03bd \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03bb\u03ac\u03b2\u03b5\u03b9 \u03c3\u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7, \u03b5\u03c0\u03b5\u03b9\u03b4\u03ae \u03b8\u03b5\u03ce\u03c1\u03b7\u03c3\u03b1\u03bd \u03cc\u03c4\u03b9 \u03ad\u03c7\u03bf\u03c5\u03bd \u03ae\u03b4\u03b7 \u03b4\u03bf\u03b8\u03b5\u03af \u03b1\u03c0\u03b1\u03bd\u03c4\u03ae\u03c3\u03b5\u03b9\u03c2 \u03c3\u03b5 \u03c0\u03c1\u03bf\u03b7\u03b3\u03bf\u03cd\u03bc\u03b5\u03bd\u03b7 \u03c3\u03cd\u03bd\u03bf\u03b4\u03bf.
+\u03a3\u03b1\u03c2 \u03c0\u03b1\u03c1\u03b1\u03ba\u03b1\u03bb\u03ce \u03bd\u03b1 \u03b5\u03c0\u03b1\u03bd\u03b5\u03be\u03b5\u03c4\u03b1\u03c3\u03c4\u03b5\u03af \u03b7 \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7 \u03b1\u03c5\u03c4\u03ae, \u03b3\u03b9\u03b1\u03c4\u03af \u03b1\u03c5\u03c4\u03cc \u03b4\u03b5\u03bd \u03b1\u03bb\u03b7\u03b8\u03b5\u03cd\u03b5\u03b9. \u039f\u03b9 \u03b5\u03c1\u03c9\u03c4\u03ae\u03c3\u03b5\u03b9\u03c2 \u03c3\u03c4\u03b9\u03c2 \u03bf\u03c0\u03bf\u03af\u03b5\u03c2 \u03b4\u03cc\u03b8\u03b7\u03ba\u03b5 \u03b1\u03c0\u03ac\u03bd\u03c4\u03b7\u03c3\u03b7 \u03c0\u03b1\u03bb\u03b1\u03b9\u03cc\u03c4\u03b5\u03c1\u03b1 \u03b1\u03bd\u03b1\u03c6\u03ad\u03c1\u03bf\u03bd\u03c4\u03b1\u03bd \u03c3\u03c4\u03b7\u03bd \u03c0\u03b1\u03c1\u03ad\u03bc\u03b2\u03b1\u03c3\u03b7 \u03c4\u03b7\u03c2 \u03ba. Palacio \u03c3\u03b5 \u03c3\u03c5\u03b3\u03ba\u03b5\u03ba\u03c1\u03b9\u03bc\u03ad\u03bd\u03b7 \u03c5\u03c0\u03cc\u03b8\u03b5\u03c3\u03b7 \u03ba\u03b1\u03b9 \u03cc\u03c7\u03b9 \u03c3\u03c4\u03b9\u03c2 \u03b5\u03bd \u03bb\u03cc\u03b3\u03c9 \u03b4\u03b7\u03bb\u03ce\u03c3\u03b5\u03b9\u03c2, \u03bf\u03b9 \u03bf\u03c0\u03bf\u03af\u03b5\u03c2 \u03b4\u03b7\u03bc\u03bf\u03c3\u03b9\u03b5\u03cd\u03c4\u03b7\u03ba\u03b1\u03bd \u03c3\u03c4\u03b7\u03bd \u03b5\u03c6\u03b7\u03bc\u03b5\u03c1\u03af\u03b4\u03b1 ABC \u03c3\u03c4\u03b9\u03c2 18 \u03c4\u03bf\u03c5 \u03c0\u03b5\u03c1\u03b1\u03c3\u03bc\u03ad\u03bd\u03bf\u03c5 \u039d\u03bf\u03b5\u03bc\u03b2\u03c1\u03af\u03bf\u03c5.
+
+\u0391\u03b3\u03b1\u03c0\u03b7\u03c4\u03ad \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03b5, \u03b8\u03b1 \u03c4\u03b1 \u03b5\u03be\u03b1\u03ba\u03c1\u03b9\u03b2\u03ce\u03c3\u03bf\u03c5\u03bc\u03b5 \u03cc\u03bb\u03b1 \u03b1\u03c5\u03c4\u03ac. \u03a3\u03b1\u03c2 \u03bf\u03bc\u03bf\u03bb\u03bf\u03b3\u03ce \u03cc\u03c4\u03b9, \u03b5\u03c0\u03af \u03c4\u03bf\u03c5 \u03c0\u03b1\u03c1\u03cc\u03bd\u03c4\u03bf\u03c2, \u03c4\u03b1 \u03c0\u03c1\u03ac\u03b3\u03bc\u03b1\u03c4\u03b1 \u03bc\u03bf\u03c5 \u03c6\u03b1\u03af\u03bd\u03bf\u03bd\u03c4\u03b1\u03b9 \u03ba\u03ac\u03c0\u03c9\u03c2 \u03c3\u03c5\u03b3\u03ba\u03b5\u03c7\u03c5\u03bc\u03ad\u03bd\u03b1. \u03a3\u03c5\u03bd\u03b5\u03c0\u03ce\u03c2, \u03b8\u03b1 \u03b5\u03c0\u03b1\u03bd\u03b5\u03be\u03b5\u03c4\u03ac\u03c3\u03bf\u03c5\u03bc\u03b5 \u03c4\u03bf \u03b8\u03ad\u03bc\u03b1 \u03c0\u03bf\u03bb\u03cd \u03c3\u03bf\u03b2\u03b1\u03c1\u03ac, \u03bf\u03cd\u03c4\u03c9\u03c2 \u03ce\u03c3\u03c4\u03b5 \u03cc\u03bb\u03b1 \u03bd\u03b1 \u03b4\u03b9\u03b5\u03c5\u03b8\u03b5\u03c4\u03b7\u03b8\u03bf\u03cd\u03bd \u03c0\u03bb\u03ae\u03c1\u03c9\u03c2.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b8\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03bd\u03b1 \u03bc\u03ac\u03b8\u03c9 \u03b5\u03ac\u03bd \u03b8\u03b1 \u03c5\u03c0\u03ac\u03c1\u03be\u03b5\u03b9 \u03ad\u03bd\u03b1 \u03c3\u03b1\u03c6\u03ad\u03c2 \u03bc\u03ae\u03bd\u03c5\u03bc\u03b1 \u03b1\u03c0\u03cc \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03b1\u03c5\u03c4\u03ae \u03c4\u03b7\u03bd \u03b5\u03b2\u03b4\u03bf\u03bc\u03ac\u03b4\u03b1 \u03b3\u03b9\u03b1 \u03c4\u03b7 \u03b4\u03c5\u03c3\u03b1\u03c1\u03ad\u03c3\u03ba\u03b5\u03b9\u03ac \u03bc\u03b1\u03c2, \u03b1\u03bd\u03b1\u03c6\u03bf\u03c1\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03b7 \u03c3\u03b7\u03bc\u03b5\u03c1\u03b9\u03bd\u03ae \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7 \u03bd\u03b1 \u03bc\u03b7\u03bd \u03b1\u03bd\u03b1\u03bd\u03b5\u03c9\u03b8\u03b5\u03af \u03c4\u03bf \u03b5\u03bc\u03c0\u03ac\u03c1\u03b3\u03ba\u03bf \u03cc\u03c0\u03bb\u03c9\u03bd \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u0399\u03bd\u03b4\u03bf\u03bd\u03b7\u03c3\u03af\u03b1, \u03b4\u03b5\u03b4\u03bf\u03bc\u03ad\u03bd\u03bf\u03c5 \u03cc\u03c4\u03b9 \u03b7 \u03c3\u03c5\u03bd\u03c4\u03c1\u03b9\u03c0\u03c4\u03b9\u03ba\u03ae \u03c0\u03bb\u03b5\u03b9\u03bf\u03c8\u03b7\u03c6\u03af\u03b1 \u03b1\u03c5\u03c4\u03bf\u03cd \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03b5\u03af\u03c7\u03b5 \u03b5\u03c0\u03b9\u03b4\u03bf\u03ba\u03b9\u03bc\u03ac\u03c3\u03b5\u03b9 \u03c4\u03bf \u03b5\u03bc\u03c0\u03ac\u03c1\u03b3\u03ba\u03bf \u03cc\u03c0\u03bb\u03c9\u03bd \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u0399\u03bd\u03b4\u03bf\u03bd\u03b7\u03c3\u03af\u03b1 \u03c3\u03c4\u03bf \u03c0\u03b1\u03c1\u03b5\u03bb\u03b8\u03cc\u03bd. 
\u0397 \u03c3\u03b7\u03bc\u03b5\u03c1\u03b9\u03bd\u03ae \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7 \u03b3\u03b9\u03b1 \u03c4\u03b7 \u03bc\u03b7 \u03b1\u03bd\u03b1\u03bd\u03ad\u03c9\u03c3\u03b7 \u03c4\u03bf\u03c5 \u03b5\u03bc\u03c0\u03ac\u03c1\u03b3\u03ba\u03bf \u03b5\u03af\u03bd\u03b1\u03b9 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03b5\u03c0\u03b9\u03ba\u03af\u03bd\u03b4\u03c5\u03bd\u03b7, \u03bb\u03b1\u03bc\u03b2\u03b1\u03bd\u03bf\u03bc\u03ad\u03bd\u03b7\u03c2 \u03c5\u03c0\u03cc\u03c8\u03b7 \u03c4\u03b7\u03c2 \u03ba\u03b1\u03c4\u03ac\u03c3\u03c4\u03b1\u03c3\u03b7\u03c2 \u03b5\u03ba\u03b5\u03af. \u03a4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03bb\u03bf\u03b9\u03c0\u03cc\u03bd \u03bf\u03c6\u03b5\u03af\u03bb\u03b5\u03b9 \u03bd\u03b1 \u03c3\u03c4\u03b5\u03af\u03bb\u03b5\u03b9 \u03ad\u03bd\u03b1 \u03bc\u03ae\u03bd\u03c5\u03bc\u03b1, \u03b5\u03c6\u03cc\u03c3\u03bf\u03bd \u03b1\u03c5\u03c4\u03ae \u03b5\u03af\u03bd\u03b1\u03b9 \u03b7 \u03b5\u03c0\u03b9\u03b8\u03c5\u03bc\u03af\u03b1 
 \u03c4\u03b7\u03c2 \u03c3\u03c5\u03bd\u03c4\u03c1\u03b9\u03c0\u03c4\u03b9\u03ba\u03ae\u03c2 \u03c0\u03bb\u03b5\u03b9\u03bf\u03c8\u03b7\u03c6\u03af\u03b1\u03c2. \u0395\u03af\u03bd\u03b1\u03b9 \u03b1\u03bd\u03b5\u03c5\u03b8\u03c5\u03bd\u03cc\u03c4\u03b7\u03c4\u03b1 \u03b5\u03ba \u03bc\u03ad\u03c1\u03bf\u03c5\u03c2 \u03c4\u03c9\u03bd \u03ba\u03c1\u03b1\u03c4\u03ce\u03bd \u03bc\u03b5\u03bb\u03ce\u03bd \u03c4\u03b7\u03c2 \u0395\u0395 \u03b7 \u03ac\u03c1\u03bd\u03b7\u03c3\u03b7 \u03b1\u03bd\u03b1\u03bd\u03ad\u03c9\u03c3\u03b7\u03c2 \u03c4\u03bf\u03c5 \u03b5\u03bc\u03c0\u03ac\u03c1\u03b3\u03ba\u03bf. \u038c\u03c0\u03c9\u03c2 \u03bb\u03ad\u03b3\u03b5\u03c4\u03b1\u03b9, \u03b7 \u03ba\u03b1\u03c4\u03ac\u03c3\u03c4\u03b1\u03c3\u03b7 \u03b5\u03ba\u03b5\u03af \u03b5\u03af\u03bd\u03b1\u03b9 \u03bb\u03af\u03b1\u03bd \u03b1\u03c3\u03c4\u03b1\u03b8\u03ae\u03c2. \u03a3\u03c4\u03b7\u03bd \u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03b9\u03ba\u03cc\u03c4\u03b7\u03c4\u03b1, \u03c5\u03c0\u03ac\u03c1\u03c7\u03b5\u03b9 \u03bf \u03ba\u03af\u03bd\u03b4\u03c5\u03bd\u03bf\u03c2 \u03c3\u03c4\u03c1\u03b1\u03c4\u03b9\u03c9\u03c4\u03b9\u03ba\u03bf\u03cd \u03c0\u03c1\u03b1\u03be\u03b9\u03ba\u03bf\u03c0\u03ae\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c3\u03c4\u03bf \u03bc\u03ad\u03bb\u03bb\u03bf\u03bd. \u0394\u03b5\u03bd \u03b3\u03bd\u03c9\u03c1\u03af\u03b6\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b9 \u03b8\u03b1 \u03c3\u03c5\u03bc\u03b2\u03b5\u03af. \u0393\u03b9\u03b1\u03c4\u03af \u03bb\u03bf\u03b9\u03c0\u03cc\u03bd \u03c0\u03c1\u03ad\u03c0\u03b5\u03b9 \u03bd\u03b1 \u03b1\u03c0\u03bf\u03ba\u03bf\u03bc\u03af\u03c3\u03bf\u03c5\u03bd \u03ba\u03ad\u03c1\u03b4\u03bf\u03c2 \u03bf\u03b9 \u03c0\u03b1\u03c1\u03b1\u03b3\u03c9\u03b3\u03bf\u03af \u03cc\u03c0\u03bb\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u0395 \u03b5\u03b9\u03c2 \u03b2\u03ac\u03c1\u03bf\u03c2 \u03b1\u03b8\u03ce\u03c9\u03bd \u03b1\u03bd\u03b8\u03c1\u03ce\u03c0\u03c9\u03bd;
+(\u03a7\u03b5\u03b9\u03c1\u03bf\u03ba\u03c1\u03bf\u03c4\u03ae\u03bc\u03b1\u03c4\u03b1)
+
+\u0395\u03bd \u03c0\u03ac\u03c3\u03b7 \u03c0\u03b5\u03c1\u03b9\u03c0\u03c4\u03ce\u03c3\u03b5\u03b9, \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u03b6\u03ae\u03c4\u03b7\u03bc\u03b1 \u03b4\u03b5\u03bd \u03c5\u03c0\u03ac\u03c1\u03c7\u03b5\u03b9 \u03b5\u03c0\u03af \u03c4\u03bf\u03c5 \u03c0\u03b1\u03c1\u03cc\u03bd\u03c4\u03bf\u03c2 \u03bc\u03b5\u03c4\u03b1\u03be\u03cd \u03c4\u03c9\u03bd \u03b1\u03b9\u03c4\u03ae\u03c3\u03b5\u03c9\u03bd \u03ba\u03b1\u03c4\u03b5\u03c0\u03b5\u03af\u03b3\u03bf\u03bd\u03c4\u03bf\u03c2 \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03b5\u03c1\u03c7\u03cc\u03bc\u03b5\u03bd\u03b7 \u03a0\u03ad\u03bc\u03c0\u03c4\u03b7.
+
+\u0394\u03b9\u03ac\u03c4\u03b1\u03be\u03b7 \u03c4\u03c9v \u03b5\u03c1\u03b3\u03b1\u03c3\u03b9\u03cev
+\u0397 \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7 \u03c0\u03c1\u03bf\u03b2\u03bb\u03ad\u03c0\u03b5\u03b9 \u03c4\u03b7\u03bd \u03b5\u03be\u03ad\u03c4\u03b1\u03c3\u03b7 \u03c4\u03bf\u03c5 \u03c4\u03b5\u03bb\u03b9\u03ba\u03bf\u03cd \u03c3\u03c7\u03b5\u03b4\u03af\u03bf\u03c5 \u03c4\u03b7\u03c2 \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1\u03c2 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7\u03c2 \u03ad\u03c4\u03c3\u03b9 \u03cc\u03c0\u03c9\u03c2 \u03ba\u03b1\u03c4\u03b1\u03c1\u03c4\u03af\u03c3\u03c4\u03b7\u03ba\u03b5 \u03b1\u03c0\u03cc \u03c4\u03b7 \u0394\u03b9\u03ac\u03c3\u03ba\u03b5\u03c8\u03b7 \u03c4\u03c9\u03bd \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03c9\u03bd, \u03c4\u03b7\u03bd \u03a0\u03ad\u03bc\u03c0\u03c4\u03b7 13 \u0399\u03b1\u03bd\u03bf\u03c5\u03b1\u03c1\u03af\u03bf\u03c5, \u03c3\u03c5\u03bc\u03c6\u03ce\u03bd\u03c9\u03c2 \u03c0\u03c1\u03bf\u03c2 \u03c4\u03bf \u03ac\u03c1\u03b8\u03c1\u03bf 110 \u03c4\u03bf\u03c5 \u039a\u03b1\u03bd\u03bf\u03bd\u03b9\u03c3\u03bc\u03bf\u03cd. \u03a3\u03b5 \u03cc,\u03c4\u03b9 \u03b1\u03c6\u03bf\u03c1\u03ac \u03c4\u03b7 \u0394\u03b5\u03c5\u03c4\u03ad\u03c1\u03b1 \u03ba\u03b1\u03b9 \u03c4\u03b7\u03bd \u03a4\u03c1\u03af\u03c4\u03b7 \u03b4\u03b5\u03bd \u03ad\u03c7\u03bf\u03c5\u03bd \u03c5\u03c0\u03bf\u03b2\u03bb\u03b7\u03b8\u03b5\u03af \u03c4\u03c1\u03bf\u03c0\u03bf\u03c0\u03bf\u03b9\u03ae\u03c3\u03b5\u03b9\u03c2.
+\u03a4\u03b5\u03c4\u03ac\u03c1\u03c4\u03b7:
+\u0397 \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd \u03b6\u03b7\u03c4\u03ac \u03bd\u03b1 \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03bb\u03b7\u03c6\u03b8\u03b5\u03af \u03bc\u03af\u03b1 \u03b4\u03ae\u03bb\u03c9\u03c3\u03b7 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03b3\u03b9\u03b1 \u03c4\u03bf\u03c5\u03c2 \u03c3\u03c4\u03c1\u03b1\u03c4\u03b7\u03b3\u03b9\u03ba\u03bf\u03cd\u03c2 \u03c3\u03c4\u03cc\u03c7\u03bf\u03c5\u03c2 \u03c4\u03b7\u03c2 \u03c4\u03b7\u03bd \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03b7 \u03c0\u03b5\u03bd\u03c4\u03b1\u03b5\u03c4\u03af\u03b1, \u03ba\u03b1\u03b8\u03ce\u03c2 \u03ba\u03b1\u03b9 \u03b3\u03b9\u03b1 \u03c4\u03b7 \u03b4\u03b9\u03bf\u03b9\u03ba\u03b7\u03c4\u03b9\u03ba\u03ae \u03bc\u03b5\u03c4\u03b1\u03c1\u03c1\u03cd\u03b8\u03bc\u03b9\u03c3\u03b7 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2.
+\u0398\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03bf \u03ba. Bar\u03c3n Crespo, \u03c3\u03c5\u03bd\u03c4\u03ac\u03ba\u03c4\u03b7\u03c2 \u03c4\u03b7\u03c2 \u03b1\u03b9\u03c4\u03ae\u03c3\u03b5\u03c9\u03c2, \u03bd\u03b1 \u03c0\u03b1\u03c1\u03ad\u03bc\u03b2\u03b5\u03b9 \u03b3\u03b9\u03b1 \u03bd\u03b1 \u03c4\u03b7\u03bd \u03b1\u03b9\u03c4\u03b9\u03bf\u03bb\u03bf\u03b3\u03ae\u03c3\u03b5\u03b9, \u03b5\u03ac\u03bd \u03b2\u03b5\u03b2\u03b1\u03af\u03c9\u03c2 \u03c4\u03bf \u03b5\u03c0\u03b9\u03b8\u03c5\u03bc\u03b5\u03af. \u03a3\u03c4\u03b7 \u03c3\u03c5\u03bd\u03ad\u03c7\u03b5\u03b9\u03b1 \u03b8\u03b1 \u03c0\u03c1\u03ac\u03be\u03bf\u03c5\u03bc\u03b5 \u03cc\u03c0\u03c9\u03c2 \u03c3\u03c5\u03bd\u03ae\u03b8\u03c9\u03c2: \u03b8\u03b1 \u03b1\u03ba\u03bf\u03cd\u03c3\u03bf\u03c5\u03bc\u03b5 \u03ad\u03bd\u03b1\u03bd \u03b1\u03b3\u03bf\u03c1\u03b7\u03c4\u03ae \u03c5\u03c0\u03ad\u03c1 \u03ba\u03b1\u03b9 \u03ad\u03bd\u03b1\u03bd \u03b1\u03b3\u03bf\u03c1\u03b7\u03c4\u03ae \u03ba\u03b1\u03c4\u03ac.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b7 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03af\u03b1\u03c3\u03b7 \u03c4\u03bf\u03c5 \u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03bf\u03cd \u03c0\u03c1\u03bf\u03b3\u03c1\u03ac\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 Prodi \u03b3\u03b9\u03b1 \u03bf\u03bb\u03cc\u03ba\u03bb\u03b7\u03c1\u03b7 \u03c4\u03b7\u03bd \u03ba\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03b5\u03c5\u03c4\u03b9\u03ba\u03ae \u03c0\u03b5\u03c1\u03af\u03bf\u03b4\u03bf \u03c5\u03c0\u03ae\u03c1\u03be\u03b5 \u03b1\u03c1\u03c7\u03b9\u03ba\u03ac \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7 \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd, \u03b7 \u03bf\u03c0\u03bf\u03af\u03b1 \u03ba\u03b1\u03c4\u03ac\u03c6\u03b5\u03c1\u03b5 \u03bd\u03b1 \u03b1\u03c0\u03bf\u03c3\u03c0\u03ac\u03c3\u03b5\u03b9 \u03c4\u03b7\u03bd \u03bf\u03bc\u03bf\u03c6\u03c9\u03bd\u03af\u03b1 \u03c4\u03b7\u03c2 \u0394\u03b9\u03ac\u03c3\u03ba\u03b5\u03c8\u03b7\u03c2 \u03c4\u03c9\u03bd \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03c9\u03bd \u03c4\u03bf\u03bd \u03a3\u03b5\u03c0\u03c4\u03ad\u03bc\u03b2\u03c1\u03b9\u03bf, \u03ba\u03b1\u03b8\u03ce\u03c2 \u03ba\u03b1\u03b9 \u03c4\u03b7 \u03c1\u03b7\u03c4\u03ae \u03ad\u03b3\u03ba\u03c1\u03b9\u03c3\u03b7 \u03c4\u03bf\u03c5 \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03bf\u03c5 \u03ba. 
Prodi, \u03bf \u03bf\u03c0\u03bf\u03af\u03bf\u03c2 \u03b5\u03c0\u03b1\u03bd\u03ad\u03bb\u03b1\u03b2\u03b5 \u03c4\u03b7 \u03b4\u03ad\u03c3\u03bc\u03b5\u03c5\u03c3\u03ae \u03c4\u03bf\u03c5 \u03c3\u03c4\u03bf\u03bd \u03bb\u03cc\u03b3\u03bf \u03c0\u03bf\u03c5 \u03b5\u03ba\u03c6\u03ce\u03bd\u03b7\u03c3\u03b5 \u03ba\u03b1\u03c4\u03ac \u03c4\u03b7\u03bd \u03c4\u03b5\u03bb\u03b5\u03c4\u03ae \u03b1\u03bd\u03ac\u03bb\u03b7\u03c8\u03b7\u03c2 \u03c4\u03c9\u03bd \u03ba\u03b1\u03b8\u03b7\u03ba\u03cc\u03bd\u03c4\u03c9\u03bd \u03c4\u03bf\u03c5.
+\u0391\u03c5\u03c4\u03ae \u03b7 \u03b4\u03ad\u03c3\u03bc\u03b5\u03c5\u03c3\u03b7 \u03b5\u03af\u03bd\u03b1\u03b9 \u03c3\u03b7\u03bc\u03b1\u03bd\u03c4\u03b9\u03ba\u03ae, \u03c3\u03c4\u03bf \u03b2\u03b1\u03b8\u03bc\u03cc \u03c0\u03bf\u03c5 \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b5\u03af\u03bd\u03b1\u03b9 \u03ad\u03bd\u03b1\u03c2 \u03bf\u03c1\u03b3\u03b1\u03bd\u03b9\u03c3\u03bc\u03cc\u03c2 \u03c0\u03bf\u03c5 \u03ba\u03b1\u03c4\u03ad\u03c7\u03b5\u03b9 \u03c4\u03bf \u03bc\u03bf\u03bd\u03bf\u03c0\u03ce\u03bb\u03b9\u03bf \u03c3\u03c4\u03b7 \u03bb\u03ae\u03c8\u03b7 \u03c0\u03c1\u03c9\u03c4\u03bf\u03b2\u03bf\u03c5\u03bb\u03b9\u03ce\u03bd \u03c3\u03cd\u03bc\u03c6\u03c9\u03bd\u03b1 \u03bc\u03b5 \u03c4\u03b9\u03c2 \u03a3\u03c5\u03bd\u03b8\u03ae\u03ba\u03b5\u03c2 \u03ba\u03b1\u03b9, \u03b5\u03c0\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2, \u03b4\u03b9\u03b1\u03bc\u03bf\u03c1\u03c6\u03ce\u03bd\u03b5\u03b9 \u03ba\u03b1\u03c4\u03ac \u03b2\u03ac\u03c3\u03b7 \u03c4\u03b7 \u03c6\u03cd\u03c3\u03b7 \u03c4\u03b7\u03c2 \u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae\u03c2 \u03ba\u03b1\u03b9 \u03bd\u03bf\u03bc\u03bf\u03b8\u03b5\u03c4\u03b9\u03ba\u03ae\u03c2 \u03b4\u03c1\u03b1\u03c3\u03c4\u03b7\u03c1\u03b9\u03cc\u03c4\u03b7\u03c4\u03b1\u03c2 \u03b1\u03c5\u03c4\u03bf\u03cd \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03b3\u03b9\u03b1 \u03c4\u03b1 \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03b1 \u03c0\u03ad\u03bd\u03c4\u03b5 \u03ad\u03c4\u03b7. 
\u03a5\u03c0\u03b5\u03bd\u03b8\u03c5\u03bc\u03af\u03b6\u03c9 \u03b5\u03c0\u03af\u03c3\u03b7\u03c2, \u03ba\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03cc\u03c4\u03b9 \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03ad\u03b4\u03c9\u03c3\u03b5 \u03c3\u03b5 \u03b4\u03cd\u03bf \u03c0\u03b5\u03c1\u03b9\u03c0\u03c4\u03ce\u03c3\u03b5\u03b9\u03c2, \u03c3\u03c4\u03b7 \u03b4\u03b9\u03ac\u03c1\u03ba\u03b5\u03b9\u03b1 \u03c4\u03b7\u03c2 \u03c0\u03c1\u03bf\u03b7\u03b3\u03bf\u03cd\u03bc\u03b5\u03bd\u03b7\u03c2 \u03ba\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03b5\u03c5\u03c4\u03b9\u03ba\u03ae\u03c2 \u03c0\u03b5\u03c1\u03b9\u03cc\u03b4\u03bf\u03c5, \u03c8\u03ae\u03c6\u03bf \u03b5\u03bc\u03c0\u03b9\u03c3\u03c4\u03bf\u03c3\u03cd\u03bd\u03b7\u03c2 \u03c3\u03c4\u03bf\u03bd \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf Prodi. \u03a3\u03c4\u03b7\u03bd \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03c3\u03b1 \u03ba\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03b5\u03c5\u03c4\u03b9\u03ba\u03ae \u03c0\u03b5\u03c1\u03af\u03bf\u03b4\u03bf, \u03c4\u03bf\u03c5 \u03ad\u03b4\u03c9\u03c3\u03b5 \u03b5\u03ba \u03bd\u03ad\u03bf\u03c5 \u03c8\u03ae\u03c6\u03bf \u03b5\u03bc\u03c0\u03b9\u03c3\u03c4\u03bf\u03c3\u03cd\u03bd\u03b7\u03c2 
 \u03bf\u03bd \u0399\u03bf\u03cd\u03bb\u03b9\u03bf \u03ba\u03b1\u03b9 \u03ba\u03b1\u03c4\u03cc\u03c0\u03b9\u03bd, \u03cc\u03c4\u03b1\u03bd \u03b7 \u03bd\u03ad\u03b1 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b5\u03af\u03c7\u03b5 \u03b1\u03bd\u03b1\u03bb\u03ac\u03b2\u03b5\u03b9 \u03c4\u03b1 \u03ba\u03b1\u03b8\u03ae\u03ba\u03bf\u03bd\u03c4\u03ac \u03c4\u03b7\u03c2, \u03ad\u03b4\u03c9\u03c3\u03b5 \u03be\u03b1\u03bd\u03ac \u03c8\u03ae\u03c6\u03bf \u03b5\u03bc\u03c0\u03b9\u03c3\u03c4\u03bf\u03c3\u03cd\u03bd\u03b7\u03c2 \u03c3\u03b5 \u03cc\u03bb\u03b7 \u03c4\u03b7\u03bd \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03c4\u03bf\u03bd \u03a3\u03b5\u03c0\u03c4\u03ad\u03bc\u03b2\u03c1\u03b9\u03bf. \u0395\u03c0\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2, \u03c5\u03c0\u03ae\u03c1\u03c7\u03b5 \u03ae\u03b4\u03b7 \u03b1\u03c1\u03ba\u03b5\u03c4\u03cc\u03c2 \u03c7\u03c1\u03cc\u03bd\u03bf\u03c2 \u03ce\u03c3\u03c4\u03b5 \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03bd\u03b1 \u03b5\u03ba\u03c0\u03bf\u03bd\u03ae\u03c3\u03b5\u03b9 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03ac \u03c4\u03b7\u03c2 \u03ba\u03b1\u03b9 \u03b5\u03bc\u03b5\u03af\u03c2 \u03bd\u03b1 \u03bc\u03c0\u03bf\u03c1\u03ad\u03c3\u03bf\u03c5\u03bc\u03b5 \u03bd\u03b1 \u03c4\u03bf \u03bc\u03ac\u03b8\u03bf\u03c5\u03bc\u03b5 \u03ba\u03b1\u03b9 \u03bd\u03b1 \u03bc\u03c0\u03bf\u03c1\u03ad\u03c3\u03bf\u03c5\u03bc\u03b5 \u03bd\u03b1 \u03c4\u03bf \u03b5\u03be\u03b7\u03b3\u03ae\u03c3\u03bf\u03c5\u03bc\u03b5 \u03c3\u03c4\u03bf\u03c5\u03c2 \u03c0\u03bf\u03bb\u03af\u03c4\u03b5\u03c2. 
\u03a3\u03b5 \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u03c0\u03bb\u03b1\u03af\u03c3\u03b9\u03bf, \u03c5\u03c0\u03b5\u03bd\u03b8\u03c5\u03bc\u03af\u03b6\u03c9 \u03c4\u03bf \u03c8\u03ae\u03c6\u03b9\u03c3\u03bc\u03b1 \u03c4\u03b7\u03c2 15\u03b7\u03c2 \u03a3\u03b5\u03c0\u03c4\u03b5\u03bc\u03b2\u03c1\u03af\u03bf\u03c5, \u03c3\u03c4\u03bf \u03bf\u03c0\u03bf\u03af\u03bf \u03c0\u03c1\u03bf\u03c4\u03b5\u03b9\u03bd\u03cc\u03c4\u03b1\u03bd \u03b7 \u03c5\u03c0\u03bf\u03b2\u03bf\u03bb\u03ae \u03c4\u03b7\u03c2 \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7\u03c2 \u03c3\u03c4\u03bf \u03c3\u03c5\u03bd\u03c4\u03bf\u03bc\u03cc\u03c4\u03b5\u03c1\u03bf \u03b4\u03c5\u03bd\u03b1\u03c4\u03cc \u03c7\u03c1\u03bf\u03bd\u03b9\u03ba\u03cc \u03b4\u03b9\u03ac\u03c3\u03c4\u03b7\u03bc\u03b1.
+\u03a4\u03b1 \u03b3\u03b5\u03b3\u03bf\u03bd\u03cc\u03c4\u03b1 \u03c0\u03bf\u03c5 \u03c3\u03c5\u03bd\u03ad\u03b2\u03b7\u03c3\u03b1\u03bd \u03c4\u03b7\u03bd \u03c0\u03b5\u03c1\u03b1\u03c3\u03bc\u03ad\u03bd\u03b7 \u03b5\u03b2\u03b4\u03bf\u03bc\u03ac\u03b4\u03b1 - \u03c4\u03b1 \u03bf\u03c0\u03bf\u03af\u03b1 \u03c0\u03c1\u03bf\u03ba\u03bb\u03ae\u03b8\u03b7\u03ba\u03b1\u03bd \u03b5\u03ba\u03c4\u03cc\u03c2 \u03c4\u03bf\u03c5 \u03c0\u03bb\u03b1\u03b9\u03c3\u03af\u03bf\u03c5 \u03c4\u03b7\u03c2 \u0394\u03b9\u03ac\u03c3\u03ba\u03b5\u03c8\u03b7\u03c2 \u03c4\u03c9\u03bd \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03c9\u03bd, \u03b5\u03bd\u03ce \u03b7 \u03c4\u03b5\u03bb\u03b5\u03c5\u03c4\u03b1\u03af\u03b1 \u03c7\u03c1\u03b7\u03c3\u03b9\u03bc\u03bf\u03c0\u03bf\u03b9\u03ae\u03b8\u03b7\u03ba\u03b5 \u03bc\u03cc\u03bd\u03bf \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03b5\u03c0\u03b9\u03b2\u03b5\u03b2\u03b1\u03af\u03c9\u03c3\u03b7 \u03ba\u03b1\u03b9 \u03c4\u03b7\u03bd \u03b5\u03c0\u03b9\u03ba\u03cd\u03c1\u03c9\u03c3\u03b7 \u03b1\u03c0\u03bf\u03c6\u03ac\u03c3\u03b5\u03c9\u03bd \u03c0\u03bf\u03c5 \u03b5\u03af\u03c7\u03b1\u03bd \u03bb\u03b7\u03c6\u03b8\u03b5\u03af \u03b5\u03ba\u03c4\u03cc\u03c2 \u03b1\u03c5\u03c4\u03ae\u03c2 - \u03b8\u03ad\u03c4\u03bf\u03c5\u03bd \u03ad\u03bd\u03b1 \u03b4\u03af\u03bb\u03b7\u03bc\u03bc\u03b1: \u03b5\u03af\u03c4\u03b5 \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b4\u03b5\u03bd \u03b5\u03af\u03bd\u03b1\u03b9 \u03c3\u03b5 \u03b8\u03ad\u03c3\u03b7 \u03bd\u03b1 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03b9\u03ac\u03c3\u03b5\u03b9 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03b1\u03c5\u03c4\u03cc� (\u03c3\u03c4\u03b7\u03bd \u03c0\u03b5\u03c1\u03af\u03c0\u03c4\u03c9\u03c3\u03b7 \u03b1\u03c5\u03c4\u03ae, \u03b8\u03b1 \u03ae\u03c4\u03b1\u03bd \u03ba\u03b1\u03bb\u03cc \u03bd\u03b1 \u03c4\u03bf \u03b4\u03b7\u03bb\u03ce\u03c3\u03b5\u03b9. 
\u03a3\u03cd\u03bc\u03c6\u03c9\u03bd\u03b1 \u03bc\u03b5 \u03c4\u03b1 \u03bb\u03b5\u03b3\u03cc\u03bc\u03b5\u03bd\u03b1 \u03c4\u03bf\u03c5 \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03bf\u03c5 \u03c4\u03b7\u03c2, \u03b5\u03af\u03bd\u03b1\u03b9 \u03c3\u03b5 \u03b8\u03ad\u03c3\u03b7 \u03bd\u03b1 \u03c4\u03bf \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03b9\u03ac\u03c3\u03b5\u03b9. \u0394\u03b5\u03b4\u03bf\u03bc\u03ad\u03bd\u03bf\u03c5 \u03cc\u03c4\u03b9 \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b5\u03ba\u03c0\u03c1\u03bf\u03c3\u03c9\u03c0\u03b5\u03af\u03c4\u03b1\u03b9 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u0391\u03bd\u03c4\u03b9\u03c0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf \u03ba. de Palacio, \u03c0\u03b9\u03c3\u03c4\u03b5\u03cd\u03c9 \u03cc\u03c4\u03b9, \u03c0\u03c1\u03b9\u03bd \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf
 \u03af\u03b1 \u03b8\u03b1 \u03ae\u03c4\u03b1\u03bd \u03c0\u03c1\u03ad\u03c0\u03bf\u03bd \u03bd\u03b1 \u03b3\u03bd\u03c9\u03c1\u03af\u03b6\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7\u03bd \u03ba\u03b1\u03c4\u03ac\u03c3\u03c4\u03b1\u03c3\u03b7 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03c9\u03c2 \u03c0\u03c1\u03bf\u03c2 \u03c4\u03b7 \u03b4\u03c5\u03bd\u03b1\u03c4\u03cc\u03c4\u03b7\u03c4\u03ac \u03c4\u03b7\u03c2 \u03bd\u03b1 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03b9\u03ac\u03c3\u03b5\u03b9 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1, \u03cc\u03c0\u03c9\u03c2 \u03b5\u03af\u03c7\u03b5 \u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03b7\u03b8\u03b5\u03af)� \u03b5\u03af\u03c4\u03b5 \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03b4\u03b5\u03bd \u03b5\u03af\u03bd\u03b1\u03b9 \u03c3\u03b5 \u03b8\u03ad\u03c3\u03b7 \u03bd\u03b1 \u03b5\u03be\u03b5\u03c4\u03ac\u03c3\u03b5\u03b9 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03b1\u03c5\u03c4\u03cc, \u03cc\u03c0\u03c9\u03c2 \u03c6\u03b1\u03af\u03bd\u03b5\u03c4\u03b1\u03b9 \u03cc\u03c4\u03b9 \u03b9\u03c3\u03c7\u03c5\u03c1\u03af\u03b6\u03bf\u03bd\u03c4\u03b1\u03b9 \u03bc\u03b5\u03c1\u03b9\u03ba\u03bf\u03af. 
\u039a\u03b1\u03c4\u03ac \u03c4\u03b7 \u03b3\u03bd\u03ce\u03bc\u03b7 \u03bc\u03bf\u03c5, \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u03b4\u03b5\u03cd\u03c4\u03b5\u03c1\u03bf \u03b5\u03bd\u03b4\u03b5\u03c7\u03cc\u03bc\u03b5\u03bd\u03bf \u03b8\u03b1 \u03c3\u03ae\u03bc\u03b1\u03b9\u03bd\u03b5 \u03cc\u03c4\u03b9 \u03c0\u03b1\u03c1\u03b1\u03b9\u03c4\u03bf\u03cd\u03bc\u03b5\u03b8\u03b1 \u03b1\u03c0\u03cc \u03c4\u03b9\u03c2 \u03b1\u03c1\u03bc\u03bf\u03b4\u03b9\u03cc\u03c4\u03b7\u03c4\u03ad\u03c2 \u03bc\u03b1\u03c2 \u03c9\u03c2 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03ba\u03b1\u03b9 \u03cc\u03c4\u03b9, \u03b5\u03c0\u03b9\u03c0\u03bb\u03ad\u03bf\u03bd, \u03b5\u03b9\u03c3\u03b1\u03b3\u03ac\u03b3\u03bf\u03c5\u03bc\u03b5 \u03bc\u03b9\u03b1 \u03c0\u03c1\u03c9\u03c4\u03cc\u03c4\u03c5\u03c0\u03b7 \u03ac\u03c0\u03bf\u03c8\u03b7, \u03bc\u03b9\u03b1 \u03ac\u03b3\u03bd\u03c9\u03c3\u03c4\u03b7 \u03bc\u03ad\u03b8\u03bf\u03b4\u03bf \u03c0\u03bf\u03c5 \u03c3\u03c5\u03bd\u03af\u03c3\u03c4\u03b1\u03c4\u03b1\u03b9 \u03c3\u03c4\u03b7 \u03b3\u03c1\u03b1\u03c0\u03c4\u03ae \u03c0\u03bb\u03b7\u03c1\u03bf\u03c6\u03cc\u03c1\u03b7\u03c3\u03b7 \u03c4\u03c9\u03bd \u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ce\u03bd \u03bf\u03bc\u03ac\u03b4\u03c9\u03bd \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03b9\u03c2 \u03c0\u03c1\u03bf\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1\u03c4\u03b9\u03ba\u03ad\u03c2 \u03b4\u03b7\u03bb\u03ce\u03c3\u03b5\u03b9\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae
  \u03bc\u03af\u03b1 \u03b5\u03b2\u03b4\u03bf\u03bc\u03ac\u03b4\u03b1 \u03c0\u03c1\u03b9\u03bd - \u03ba\u03b1\u03b9 \u03cc\u03c7\u03b9 \u03c4\u03b7\u03bd \u03c0\u03c1\u03bf\u03b7\u03b3\u03bf\u03cd\u03bc\u03b5\u03bd\u03b7 \u03bc\u03ad\u03c1\u03b1, \u03cc\u03c0\u03c9\u03c2 \u03b5\u03af\u03c7\u03b5 \u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03b7\u03b8\u03b5\u03af - \u03bb\u03b1\u03bc\u03b2\u03ac\u03bd\u03bf\u03bd\u03c4\u03b1\u03c2 \u03c5\u03c0\u03cc\u03c8\u03b7 \u03cc\u03c4\u03b9 \u03c4\u03bf \u03bd\u03bf\u03bc\u03bf\u03b8\u03b5\u03c4\u03b9\u03ba\u03cc \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03b8\u03b1 \u03c3\u03c5\u03b6\u03b7\u03c4\u03b7\u03b8\u03b5\u03af \u03c4\u03bf\u03bd \u03a6\u03b5\u03b2\u03c1\u03bf\u03c5\u03ac\u03c1\u03b9\u03bf, \u03ba\u03b1\u03c4\u03ac \u03c4\u03ad\u03c4\u03bf\u03b9\u03bf\u03bd \u03c4\u03c1\u03cc\u03c0\u03bf \u03ce\u03c3\u03c4\u03b5 \u03b8\u03b1 \u03bc\u03c0\u03bf\u03c1\u03bf\u03cd\u03c3\u03b1\u03bc\u03b5 \u03bd\u03b1 \u03c0\u03b1\u03c1\u03b1\u03ba\u03ac\u03bc\u03c8\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7, \u03b5\u03c0\u03b5\u03b9\u03b4\u03ae \u03c4\u03b7\u03bd \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03b7 \u03bc\u03ad\u03c1\u03b1 \u03b8\u03b1 \u03b5\u03af\u03c7\u03b5 \u03b3\u03bd\u03c9\u03c3\u03c4\u03bf\u03c0\u03bf\u03b9\u03b7\u03b8\u03b5\u03af \u03c3\u03b5 \u03cc\u03bb\u03bf\u03c5\u03c2 \u03c4\u03bf\u03c5\u03c2 \u03c0\u03bf\u03bb\u03af\u03c4\u03b5\u03c2 \u03b1\u03c0\u03cc \u03c4\u03bf\u03bd \u03a4\u03cd\u03c0\u03bf \u03ba\u03b1\u03b9 \u03c4\u03bf Internet \u03ba\u03b1\u03b9 \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03b4\u03b5\u03bd \u03b8\u03b1 \u03b5\u03af\u03c7\u03b5 \u03c0\u03b9\u03b1 \u03bb\u03cc\u03b3\u03bf \u03bd\u03b1 \u03b1\u03c3\u03c7\u03bf\u03bb\u03b7\u03b8\u03b5\u03af \u03bc\u03b5 \u03c4\u03bf \u03b6\u03ae\u03c4\u03b7\u03bc\u03b1.
+\u039a\u03b1\u03b8\u03ce\u03c2 \u03b7 \u039f\u03bc\u03ac\u03b4\u03b1 \u03bc\u03bf\u03c5 \u03c0\u03b9\u03c3\u03c4\u03b5\u03cd\u03b5\u03b9 \u03cc\u03c4\u03b9 \u03c3\u03ba\u03bf\u03c0\u03cc\u03c2 \u03c4\u03b7\u03c2 \u03c3\u03cd\u03c3\u03c4\u03b1\u03c3\u03b7\u03c2 \u03b5\u03bd\u03cc\u03c2 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03b5\u03af\u03bd\u03b1\u03b9 \u03bd\u03b1 \u03b1\u03ba\u03bf\u03cd\u03b5\u03b9, \u03bd\u03b1 \u03b4\u03b9\u03b1\u03b2\u03bf\u03c5\u03bb\u03b5\u03cd\u03b5\u03c4\u03b1\u03b9 \u03ba\u03b1\u03b9 \u03bd\u03b1 \u03c3\u03c4\u03bf\u03c7\u03ac\u03b6\u03b5\u03c4\u03b1\u03b9, \u03bd\u03bf\u03bc\u03af\u03b6\u03bf\u03c5\u03bc\u03b5 \u03cc\u03c4\u03b9 \u03b4\u03b5\u03bd \u03c5\u03c0\u03ac\u03c1\u03c7\u03b5\u03b9 \u03ba\u03b1\u03bd\u03ad\u03bd\u03b1\u03c2 \u03bb\u03cc\u03b3\u03bf\u03c2 \u03c0\u03bf\u03c5 \u03bd\u03b1 \u03b4\u03b9\u03ba\u03b1\u03b9\u03bf\u03bb\u03bf\u03b3\u03b5\u03af \u03b1\u03c5\u03c4\u03ae \u03c4\u03b7\u03bd \u03b1\u03bd\u03b1\u03b2\u03bf\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03b9\u03c3\u03c4\u03b5\u03cd\u03bf\u03c5\u03bc\u03b5 \u03cc\u03c4\u03b9 \u03b1\u03bd \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b5\u03af\u03bd\u03b1\u03b9 \u03c3\u03b5 \u03b8\u03ad\u03c3\u03b7 \u03bd\u03b1 \u03c4\u03bf \u03ba\u03ac\u03bd\u03b5\u03b9, \u03b7 \u03c3\u03c4\u03b9\u03b3\u03bc\u03ae \u03b5\u03af\u03bd\u03b1\u03b9 \u03ba\u03b1\u03c4\u03ac\u03bb\u03bb\u03b7\u03bb\u03b7, \u03b3\u03b9\u03b1 \u03bd\u03b1 \u03bc\u03c0\u03bf\u03c1\u03ad\u03c3\u03bf\u03c5\u03bc\u03b5 \u03bd\u03b1 \u03b5\u03c0\u03b1\u03bd\u03b1\u03c6\u03ad\u03c1\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7\u03bd \u03b1\u03c1\u03c7\u03b9\u03ba\u03ae \u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03af\u03b1 \u03bc\u03b5\u03c4\u03b1\u03be\u03cd \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03ba\u03b1\u03b9 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03ba\u03b1\u03b9 \u03bd\u03b1 
\u03c3\u03c5\u03bd\u03b5\u03c7\u03af\u03c3\u03bf\u03c5\u03bc\u03b5 \u03c4\u03bf \u03ad\u03c1\u03b3\u03bf \u03bc\u03b1\u03c2 \u03bc\u03b5 \u03c5\u03c0\u03b5\u03c5\u03b8\u03c5\u03bd\u03cc\u03c4\u03b7\u03c4\u03b1 \u03b1\u03c0\u03ad\u03bd\u03b1\u03bd\u03c4\u03b9 \u03c3\u03c4\u03bf\u03c5\u03c2 \u03c3\u03c5\u03bc\u03c0\u03bf\u03bb\u03af\u03c4\u03b5\u03c2 \u03bc\u03b1\u03c2. \u0395\u03c0\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2, \u03b7 \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7 \u03c0\u03bf\u03c5 \u03c5\u03c0\u03bf\u03b2\u03ac\u03bb\u03b5\u03b9 \u03b7 \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd \u03ba\u03b1\u03b9 \u03c0\u03bf
  \u03b5\u03c3\u03b5\u03af\u03c2 \u03b1\u03bd\u03b1\u03c6\u03ad\u03c1\u03b1\u03c4\u03b5, \u03b5\u03af\u03bd\u03b1\u03b9 \u03bd\u03b1 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03b9\u03b1\u03c3\u03c4\u03b5\u03af \u03c4\u03b7\u03bd \u03a4\u03b5\u03c4\u03ac\u03c1\u03c4\u03b7 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03c4\u03b7\u03c2 \u03ba\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03b5\u03c5\u03c4\u03b9\u03ba\u03ae\u03c2 \u03c0\u03b5\u03c1\u03b9\u03cc\u03b4\u03bf\u03c5 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03c4\u03bf\u03c5 \u03ba. Prodi, \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03bb\u03b1\u03bc\u03b2\u03b1\u03bd\u03bf\u03bc\u03ad\u03bd\u03bf\u03c5 \u03c3\u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf\u03c5 \u03c3\u03c7\u03b5\u03b4\u03af\u03bf\u03c5 \u03c4\u03b7\u03c2 \u03b4\u03b9\u03bf\u03b9\u03ba\u03b7\u03c4\u03b9\u03ba\u03ae\u03c2 \u03bc\u03b5\u03c4\u03b1\u03c1\u03c1\u03cd\u03b8\u03bc\u03b9\u03c3\u03b7\u03c2 \u03b4\u03b9\u03cc\u03c4\u03b9, \u03b4\u03b9\u03b1\u03c6\u03bf\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac, \u03b5\u03af\u03bd\u03b1\u03b9 \u03c0\u03b9\u03b8\u03b1\u03bd\u03cc \u03bd\u03b1 \u03ad\u03c1\u03b8\u03bf\u03c5\u03bc\u03b5 \u03b1\u03bd\u03c4\u03b9\u03bc\u03ad\u03c4\u03c9\u03c0\u03bf\u03b9 \u03bc\u03b5 \u03bc\u03b9\u03b1 \u03c0\u03b1\u03c1\u03ac\u03b4\u03bf\u03be\u03b7 \u03ba\u03b1\u03c4\u03ac\u03c3\u03c4\u03b1\u03c3\u03b7: \u03bc\u03b5 \u03c4\u03b7 \u03b4\u03b9\u03ba\u03b1\u03b9\u03bf\u03bb\u03bf\u03b3\u03af\u03b1 \u03cc\u03c4\u03b9 \u03b4\u03b5\u03bd \u03c5\u03c0\u03ac\u03c1\u03c7\u03b5\u03b9 \u03ad\u03b3\u03b3\u03c1\u03b1\u03c6\u03bf, \u03b1\u03c0\u03cc \u03c4\u03b7 \u03bc\u03b9\u03b1 \u03c0\u03bb\u03b5\u03c5\u03c1\u03ac \u03bf \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03c3\u03c4\u03b5\u03c1\u03b5\u03af\u03c4\u03b1\u03b9 \u03c4\u03bf\u03c5 
\u03b4\u03b9\u03ba\u03b1\u03b9\u03ce\u03bc\u03b1\u03c4\u03bf\u03c2 \u03bd\u03b1 \u03bc\u03b9\u03bb\u03ae\u03c3\u03b5\u03b9 \u03b5\u03bd\u03ce\u03c0\u03b9\u03bf\u03bd \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03ba\u03b1\u03b9 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03ac\u03bb\u03bb\u03b7 \u03b1\u03c0\u03bf\u03c1\u03c1\u03af\u03c0\u03c4\u03b5\u03c4\u03b1\u03b9 \u03b7 \u03b4\u03b9\u03b5\u03be\u03b1\u03b3\u03c9\u03b3\u03ae \u03bc\u03b9\u03b1\u03c2 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7\u03c2 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03b7 \u03bc\u03b5\u03c4\u03b1\u03c1\u03c1\u03cd\u03b8\u03bc\u03b9\u03c3\u03b7, \u03c7\u03c9\u03c1\u03af\u03c2 \u03bd\u03b1 \u03b3\u03bd\u03c9\u03c1\u03af\u03b6\u03b5\u03b9 \u03b1\u03c0\u03cc \u03c0
 \u03b9\u03bd \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03b1\u03c5\u03c4\u03cc \u03c4\u03b1 \u03ba\u03b5\u03af\u03bc\u03b5\u03bd\u03b1 \u03c3\u03c4\u03b1 \u03bf\u03c0\u03bf\u03af\u03b1 \u03b8\u03b1 \u03b2\u03b1\u03c3\u03af\u03b6\u03b5\u03c4\u03b1\u03b9 \u03b1\u03c5\u03c4\u03ae \u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7. \u0395\u03c0\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2, \u03c3\u03b1\u03c2 \u03c0\u03b1\u03c1\u03b1\u03ba\u03b1\u03bb\u03ce, \u03ba\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03bd\u03b1 \u03b6\u03b7\u03c4\u03ae\u03c3\u03b5\u03c4\u03b5 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03bd\u03b1 \u03b5\u03ba\u03c6\u03c1\u03ac\u03c3\u03b5\u03b9 \u03c4\u03b7\u03bd \u03ac\u03c0\u03bf\u03c8\u03ae \u03c4\u03b7\u03c2 \u03b1\u03c5\u03c4\u03ae \u03c4\u03b7 \u03c3\u03c4\u03b9\u03b3\u03bc\u03ae \u03ba\u03b1\u03b9 \u03c3\u03c4\u03b7 \u03c3\u03c5\u03bd\u03ad\u03c7\u03b5\u03b9\u03b1 \u03bd\u03b1 \u03c0\u03c1\u03bf\u03c7\u03c9\u03c1\u03ae\u03c3\u03bf\u03c5\u03bc\u03b5 \u03c3\u03c4\u03b7\u03bd \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1.
+(\u03a7\u03b5\u03b9\u03c1\u03bf\u03ba\u03c1\u03bf\u03c4\u03ae\u03bc\u03b1\u03c4\u03b1 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd)
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b1\u03b3\u03b1\u03c0\u03b7\u03c4\u03bf\u03af \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03bf\u03b9, \u03bf\u03bc\u03bf\u03bb\u03bf\u03b3\u03bf\u03c5\u03bc\u03ad\u03bd\u03c9\u03c2 \u03b5\u03ba\u03c0\u03bb\u03ae\u03c3\u03c3\u03bf\u03bc\u03b1\u03b9 \u03ba\u03ac\u03c0\u03c9\u03c2 \u03bc\u03b5 \u03c4\u03b7 \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03c6\u03bf\u03c1\u03ac \u03c4\u03bf\u03c5 \u03c3\u03c5\u03bd\u03b1\u03b4\u03ad\u03bb\u03c6\u03bf\u03c5 Bar\u00f3n Crespo, \u03bf \u03bf\u03c0\u03bf\u03af\u03bf\u03c2 \u03b6\u03b7\u03c4\u03b5\u03af \u03c4\u03ce\u03c1\u03b1 \u03bd\u03b1 \u03c4\u03b5\u03b8\u03b5\u03af \u03c3\u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7 \u03c4\u03b7\u03c2 \u03a4\u03b5\u03c4\u03ac\u03c1\u03c4\u03b7\u03c2 \u03c4\u03bf \u03b5\u03bd \u03bb\u03cc\u03b3\u03c9 \u03b8\u03ad\u03bc\u03b1 \u03c4\u03b7\u03c2 \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1\u03c2 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7\u03c2.
+\u039a\u03cd\u03c1\u03b9\u03b5 \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03b5 Bar\u00f3n Crespo, \u03b4\u03b5\u03bd \u03bc\u03c0\u03bf\u03c1\u03ad\u03c3\u03b1\u03c4\u03b5 \u03bd\u03b1 \u03c0\u03b1\u03c1\u03b5\u03c5\u03c1\u03b5\u03b8\u03b5\u03af\u03c4\u03b5 \u03c3\u03c4\u03b7 \u0394\u03b9\u03ac\u03c3\u03ba\u03b5\u03c8\u03b7 \u03c4\u03c9\u03bd \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03c9\u03bd \u03c4\u03b7\u03bd \u03c0\u03b5\u03c1\u03b1\u03c3\u03bc\u03ad\u03bd\u03b7 \u03a0\u03ad\u03bc\u03c0\u03c4\u03b7. \u0394\u03b5\u03bd \u03c4\u03bf \u03ba\u03b1\u03c4\u03b1\u03ba\u03c1\u03af\u03bd\u03c9 \u03b1\u03c5\u03c4\u03cc, \u03b4\u03b9\u03cc\u03c4\u03b9 \u03c3\u03c5\u03bc\u03b2\u03b1\u03af\u03bd\u03b5\u03b9 \u03c3\u03c5\u03c7\u03bd\u03ac \u03bd\u03b1 \u03c3\u03c4\u03ad\u03bb\u03bd\u03bf\u03c5\u03bc\u03b5 \u03b5\u03ba\u03c0\u03c1\u03bf\u03c3\u03ce\u03c0\u03bf\u03c5\u03c2 \u03bc\u03b1\u03c2. \u0395\u03ba\u03b5\u03af \u03c3\u03b1\u03c2 \u03b5\u03ba\u03c0\u03c1\u03bf\u03c3\u03ce\u03c0\u03b7\u03c3\u03b5 \u03bf \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03bf\u03c2 H\u00e4nsch. \u03a3\u03c4\u03b7 \u0394\u03b9\u03ac\u03c3\u03ba\u03b5\u03c8\u03b7 \u03c4\u03c9\u03bd \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03c9\u03bd \u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03bf\u03c0\u03bf\u03b9\u03ae\u03c3\u03b1\u03bc\u03b5 \u03bc\u03b9\u03b1 \u03b4\u03b9\u03b5\u03be\u03bf\u03b4\u03b9\u03ba\u03ae \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7. \u0397 \u039f\u03bc\u03ac\u03b4\u03b1 \u03c3\u03b1\u03c2 \u03ae\u03c4\u03b1\u03bd \u03b7 \u03bc\u03cc\u03bd\u03b7 \u03c0\u03bf\u03c5 \u03c5\u03c0\u03bf\u03c3\u03c4\u03ae\u03c1\u03b9\u03be\u03b5 \u03b1\u03c5\u03c4\u03cc \u03c0\u03bf\u03c5 \u03b1\u03bd\u03b1\u03c6\u03ad\u03c1\u03b5\u03c4\u03b5 \u03c4\u03ce\u03c1\u03b1 \u03b5\u03c3\u03b5\u03af\u03c2. \u03a3\u03c4\u03b7 \u03c3\u03c5\u03bd\u03ad\u03c7\u03b5\u03b9\u03b1 \u03b4\u03b9\u03b5\u03bd\u03b5\u03c1\u03b3\u03ae\u03c3\u03b1\u03bc\u03b5 \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1. 
\u039a\u03ac\u03b8\u03b5 \u03c0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf\u03c2 \u03b4\u03b9\u03b1\u03b8\u03ad\u03c4\u03b5\u03b9, \u03c9\u03c2 \u03b3\u03bd\u03c9\u03c3\u03c4\u03cc\u03bd, \u03c4\u03cc\u03c3\u03b5\u03c2 \u03c8\u03ae\u03c6\u03bf\u03c5\u03c2, \u03cc\u03c3\u03b1 \u03b5\u03af\u03bd\u03b1\u03b9 \u03ba\u03b1\u03b9 \u03c4\u03b1 \u03bc\u03ad\u03bb\u03b7 \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2. \u0394\u03b9\u03b5\u03bd\u03b5\u03c1\u03b3\u03ae\u03b8\u03b7\u03ba\u03b5 \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1 \u03b5\u03c0' \u03b1\u03c5\u03c4\u03bf\u03cd, \u03c4\u03bf \u03b1\u03c0\u03bf\u03c4\u03ad\u03bb\u03b5\u03c3\u03bc\u03b1 \u03c4\u03b7\u03c2 \u03bf\u03c0\u03bf\u03af\u03b1\u03c2 \u03b5\u03af\u03c7\u03b5 \u03c9\u03c2 \u03b5\u03be\u03ae
  \u03b1\u03c0' \u03cc,\u03c4\u03b9 \u03b8\u03c5\u03bc\u03ac\u03bc\u03b1\u03b9: 422 \u03c8\u03ae\u03c6\u03bf\u03b9 \u03b5\u03bd\u03b1\u03bd\u03c4\u03af\u03bf\u03bd 180 \u03c8\u03ae\u03c6\u03c9\u03bd \u03bc\u03b5 \u03bc\u03b5\u03c1\u03b9\u03ba\u03ad\u03c2 \u03bf\u03bb\u03b9\u03b3\u03ac\u03c1\u03b9\u03b8\u03bc\u03b5\u03c2 \u03b1\u03c0\u03bf\u03c7\u03ad\u03c2. \u0394\u03b7\u03bb\u03b1\u03b4\u03ae, \u03cc\u03bb\u03b5\u03c2 \u03bf\u03b9 \u039f\u03bc\u03ac\u03b4\u03b5\u03c2, \u03bc\u03b5 \u03c4\u03b7\u03bd \u03b5\u03be\u03b1\u03af\u03c1\u03b5\u03c3\u03b7 \u03c4\u03c9\u03bd \u03bc\u03b7 \u03b5\u03b3\u03b3\u03b5\u03b3\u03c1\u03b1\u03bc\u03bc\u03ad\u03bd\u03c9\u03bd - \u03bf\u03b9 \u03bf\u03c0\u03bf\u03af\u03bf\u03b9 \u03cc\u03bc\u03c9\u03c2 \u03b4\u03b5\u03bd \u03b1\u03c0\u03bf\u03c4\u03b5\u03bb\u03bf\u03cd\u03bd \u03b2\u03ad\u03b2\u03b1\u03b9\u03b1 \u039f\u03bc\u03ac\u03b4\u03b1 - \u03c3\u03c5\u03bc\u03c6\u03ce\u03bd\u03b7\u03c3\u03b1\u03bd \u03bc\u03b5\u03c4\u03b1\u03be\u03cd \u03c4\u03bf\u03c5\u03c2, \u03ba\u03b1\u03b9 \u03bc\u03cc\u03bd\u03bf\u03bd \u03b7 \u03b4\u03b9\u03ba\u03ae \u03c3\u03b1\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1 \u03ae\u03c4\u03b1\u03bd \u03c4\u03b7\u03c2 \u03b3\u03bd\u03ce\u03bc\u03b7\u03c2 \u03c0\u03c9\u03c2 \u03c0\u03c1\u03ad\u03c0\u03b5\u03b9 \u03bd\u03b1 \u03c0\u03c1\u03bf\u03c7\u03c9\u03c1\u03ae\u03c3\u03bf\u03c5\u03bc\u03b5 \u03bc\u03b5 \u03c4\u03bf\u03bd \u03c4\u03c1\u03cc\u03c0\u03bf \u03c0\u03bf\u03c5 \u03b5\u03c3\u03b5\u03af\u03c2 \u03c0\u03c1\u03bf\u03c4\u03b5\u03af\u03bd\u03b1\u03c4\u03b5 \u03b5\u03b4\u03ce. \u038c\u03bb\u03bf\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9 \u03b5\u03af\u03c7\u03b1\u03bd \u03b4\u03b9\u03b1\u03c6\u03bf\u03c1\u03b5\u03c4\u03b9\u03ba\u03ae \u03b3\u03bd\u03ce\u03bc\u03b7. \u0391\u03c5\u03c4\u03ae \u03ae\u03c4\u03b1\u03bd \u03b7 \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7.
+\u03a4\u03ce\u03c1\u03b1 \u03b8\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03bd\u03b1 \u03c0\u03c9 \u03ba\u03ac\u03c4\u03b9 \u03b5\u03c0\u03af \u03c4\u03bf\u03c5 \u03b9\u03b4\u03af\u03bf\u03c5 \u03c4\u03bf\u03c5 \u03b8\u03ad\u03bc\u03b1\u03c4\u03bf\u03c2. \u0395\u03bc\u03c0\u03b9\u03c3\u03c4\u03b5\u03c5\u03cc\u03bc\u03b1\u03c3\u03c4\u03b5 \u03c4\u03b7\u03bd \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae, \u03c4\u03bf\u03bd Romano Prodi, \u03ba\u03b1\u03b9 \u03b7 \u03c3\u03c5\u03bd\u03c4\u03c1\u03b9\u03c0\u03c4\u03b9\u03ba\u03ae \u03c0\u03bb\u03b5\u03b9\u03bf\u03c8\u03b7\u03c6\u03af\u03b1 \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03bc\u03b1\u03c2 \u03b5\u03be\u03ad\u03c6\u03c1\u03b1\u03c3\u03b5 \u03c4\u03b7\u03bd \u03b5\u03bc\u03c0\u03b9\u03c3\u03c4\u03bf\u03c3\u03cd\u03bd\u03b7 \u03c4\u03b7\u03c2 \u03c3\u03c4\u03bf\u03bd Romano Prodi \u03ba\u03b1\u03b9 \u03c4\u03b7\u03bd \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03ad\u03c0\u03b5\u03b9\u03c4\u03b1 \u03b1\u03c0\u03cc \u03bc\u03b9\u03b1 \u03b4\u03cd\u03c3\u03ba\u03bf\u03bb\u03b7 \u03b4\u03b9\u03b1\u03b4\u03b9\u03ba\u03b1\u03c3\u03af\u03b1, \u03cc\u03c0\u03c9\u03c2 \u03b3\u03bd\u03c9\u03c1\u03af\u03b6\u03bf\u03c5\u03bd \u03cc\u03bb\u03bf\u03b9. 
\u0395\u03af\u03bc\u03b1\u03c3\u03c4\u03b5 \u03cc\u03bc\u03c9\u03c2 \u03ba\u03b1\u03b9 \u03c4\u03b7\u03c2 \u03b3\u03bd\u03ce\u03bc\u03b7\u03c2 \u03c0\u03c9\u03c2 \u03c0\u03c1\u03ad\u03c0\u03b5\u03b9 \u03bd\u03b1 \u03b4\u03b9\u03b5\u03be\u03b1\u03b3\u03ac\u03b3\u03bf\u03c5\u03bc\u03b5 \u03bc\u03b9\u03b1 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03b5\u03bd \u03bb\u03cc\u03b3\u03c9 \u03c3\u03c4\u03c1\u03b1\u03c4\u03b7\u03b3\u03b9\u03ba\u03ae \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03c3\u03c4\u03bf \u03c0\u03bb\u03b1\u03af\u03c3\u03b9\u03bf \u03bc\u03b9\u03b1\u03c2 \u03ba\u03b1\u03bd\u03bf\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b4\u03b9\u03b1\u03b4\u03b9\u03ba\u03b1\u03c3\u03af\u03b1\u03c2, \u03cc\u03c7\u03b9 \u03bc\u03cc\u03bd\u03bf\u03bd \u03b2\u03ac\u03c3\u03b5\u03b9 \u03bc\u03b9\u03b1\u03c2 \u03c0\u03c1\u03bf\u03c6\u03bf\u03c1\u03b9\u03ba\u03ae\u03c2 \u03b4\u03ae\u03bb\u03c9\u03c3\u03b7\u03c2 \u03b5\u03b4\u03ce \u03c3\u03c4\u03bf \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03cc \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf, \u03b1\u03bb\u03bb\u03ac \u03ba\u03b1\u03b9 \u03b2\u03ac\u03c3\u03b5\u03b9 \u03b5\u03bd\u03cc\u03c2 \u03b5\u03b3\u03b3\u03c1\u03ac\u03c6\u03bf\u03c5 \u03c0\u03bf\u03c5 \u03b8\u03b1 \u03ad\u03c7\u03b5\u03b9 \u03c8\u03b7\u03c6\u03b9\u03c3\u03b8\u03b5\u03af \u03c3\u03c4\u03b7\u03bd \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bf\u03c5 \u03b8\u03b1
  \u03c0\u03b5\u03c1\u03b9\u03b3\u03c1\u03ac\u03c6\u03b5\u03b9 \u03c4\u03bf \u03c0\u03b5\u03bd\u03c4\u03b1\u03b5\u03c4\u03ad\u03c2 \u03b1\u03c5\u03c4\u03cc \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1. \u03a0\u03b1\u03c1\u03cc\u03bc\u03bf\u03b9\u03bf \u03ad\u03b3\u03b3\u03c1\u03b1\u03c6\u03bf \u03b4\u03b5\u03bd \u03c5\u03c6\u03af\u03c3\u03c4\u03b1\u03c4\u03b1\u03b9!
+(\u03a7\u03b5\u03b9\u03c1\u03bf\u03ba\u03c1\u03bf\u03c4\u03ae\u03bc\u03b1\u03c4\u03b1)
+\u0397 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b8\u03b1 \u03ba\u03b1\u03c4\u03b1\u03b8\u03ad\u03c3\u03b5\u03b9 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03b3\u03b9\u03b1 \u03c4\u03bf \u03ad\u03c4\u03bf\u03c2 2000 \u03c4\u03bf\u03bd \u03a6\u03b5\u03b2\u03c1\u03bf\u03c5\u03ac\u03c1\u03b9\u03bf. \u0395\u03bc\u03b5\u03af\u03c2 \u03b5\u03af\u03c0\u03b1\u03bc\u03b5, \u03b5\u03bd\u03c4\u03ac\u03be\u03b5\u03b9, \u03b5\u03ac\u03bd \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b4\u03b5\u03bd \u03b5\u03c0\u03b9\u03b8\u03c5\u03bc\u03b5\u03af \u03bd\u03b1 \u03b5\u03ba\u03c0\u03bf\u03bd\u03ae\u03c3\u03b5\u03b9 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 2000 \u03ae\u03b4\u03b7 \u03b5\u03bd\u03c4\u03cc\u03c2 \u03c4\u03bf\u03c5 \u0399\u03b1\u03bd\u03bf\u03c5\u03b1\u03c1\u03af\u03bf\u03c5, \u03c4\u03cc\u03c4\u03b5 \u03b8\u03b1 \u03ba\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03b5\u03bd\u03c4\u03cc\u03c2 \u03c4\u03bf\u03c5 \u03a6\u03b5\u03b2\u03c1\u03bf\u03c5\u03b1\u03c1\u03af\u03bf\u03c5. 
\u03a4\u03bf \u03b5\u03b3\u03ba\u03c1\u03af\u03bd\u03b1\u03bc\u03b5 \u03b1\u03c5\u03c4\u03cc, \u03b4\u03b9\u03cc\u03c4\u03b9 \u03b2\u03b1\u03c3\u03b9\u03ba\u03ac \u03b4\u03b5\u03bd \u03b5\u03c0\u03b9\u03b8\u03c5\u03bc\u03bf\u03cd\u03bc\u03b5 \u03b4\u03b9\u03ad\u03bd\u03b5\u03be\u03b7 \u03bc\u03b5 \u03c4\u03b7\u03bd \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae, \u03b1\u03bb\u03bb\u03ac \u03b5\u03af\u03bc\u03b1\u03c3\u03c4\u03b5 \u03c4\u03b7\u03c2 \u03b3\u03bd\u03ce\u03bc\u03b7\u03c2 \u03c0\u03c9\u03c2 \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03ba\u03b1\u03b9 \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03c0\u03c1\u03ad\u03c0\u03b5\u03b9 \u03bd\u03b1 \u03c3\u03c5\u03bc\u03c0\u03bf\u03c1\u03b5\u03cd\u03bf\u03bd\u03c4\u03b1\u03b9 \u03cc\u03c0\u03bf\u03c4\u03b5 \u03b1\u03c5\u03c4\u03cc \u03b5\u03af\u03bd\u03b1\u03b9 \u03b5\u03c6\u03b9\u03ba\u03c4\u03cc. \u038c\u03bc\u03c9\u03c2 \u03b5\u03bc\u03b5\u03af\u03c2 \u03c9\u03c2 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03b5\u03af\u03bc\u03b1\u03c3\u03c4\u03b5 \u03ba\u03b1\u03b9 \u03bf \u03b5\u03bb\u03b5\u03b3\u03ba\u03c4\u03ae\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2. \u039a\u03b1\u03b9 \u03b4\u03b5\u03bd \u03b5\u03af\u03bd\u03b1\u03b9 \u03b1\u03c0\u03b1\u03c1\u03b1\u03af\u03c4\u03b7\u03c4\u03bf \u03b7 \u03b3\u03bd\u03ce\u03bc\u03b7 \u03bc\u03b1\u03c2 \u03bd\u03b1 \u03c4\u03b1\u03c5\u03c4\u03af\u03b6\u03b5\u03c4\u03b1\u03b9 \u03bc\u03b5 \u03bf\u03c4\u03b9\u03b4\u03ae\u03c0\u03bf\u03c4\u03b5 \u03c0\u03c1\u03bf\u03ad\u03c1\u03c7\u03b5\u03c4\u03b1\u03b9 \u03b1\u03c0\u03cc 
 \u03c4\u03b7\u03bd \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae.
+\u0398\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03bd\u03b1 \u03b5\u03af\u03bc\u03b1\u03c3\u03c4\u03b5 \u03c3\u03b5 \u03b8\u03ad\u03c3\u03b7 \u03bd\u03b1 \u03c0\u03c1\u03bf\u03b5\u03c4\u03bf\u03b9\u03bc\u03b1\u03c3\u03c4\u03bf\u03cd\u03bc\u03b5 \u03ba\u03b1\u03bb\u03ac \u03c3\u03c4\u03b9\u03c2 \u039f\u03bc\u03ac\u03b4\u03b5\u03c2 \u03b3\u03b9\u03b1 \u03bc\u03b9\u03b1 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03bf \u03c0\u03b5\u03bd\u03c4\u03b1\u03b5\u03c4\u03ad\u03c2 \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1. \u0394\u03b5\u03bd \u03bc\u03c0\u03bf\u03c1\u03b5\u03af \u03ba\u03b1\u03bd\u03b5\u03af\u03c2 \u03bd\u03b1 \u03c0\u03c1\u03bf\u03b5\u03c4\u03bf\u03b9\u03bc\u03b1\u03c3\u03c4\u03b5\u03af, \u03cc\u03c4\u03b1\u03bd \u03b1\u03ba\u03bf\u03cd\u03b5\u03b9 \u03b5\u03b4\u03ce \u03bc\u03b9\u03b1 \u03b4\u03ae\u03bb\u03c9\u03c3\u03b7 \u03c7\u03c9\u03c1\u03af\u03c2 \u03bd\u03b1 \u03b3\u03bd\u03c9\u03c1\u03af\u03b6\u03b5\u03b9 \u03ba\u03b1\u03b8\u03cc\u03bb\u03bf\u03c5 \u03c0\u03bf\u03b9\u03bf \u03b5\u03af\u03bd\u03b1\u03b9 \u03c4\u03bf \u03c0\u03b5\u03c1\u03b9\u03b5\u03c7\u03cc\u03bc\u03b5\u03bd\u03bf \u03bc\u03b9\u03b1\u03c2 \u03c4\u03ad\u03c4\u03bf\u03b9\u03b1\u03c2 \u03b4\u03ae\u03bb\u03c9\u03c3\u03b7\u03c2. 
\u0393\u03b9\u03b1 \u03c4\u03bf\u03bd \u03bb\u03cc\u03b3\u03bf \u03b1\u03c5\u03c4\u03cc \u03ba\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7 \u03c3\u03cd\u03c3\u03c4\u03b1\u03c3\u03b7 - \u03ba\u03b1\u03b9 \u03ad\u03c7\u03c9 \u03c4\u03b7\u03bd \u03b5\u03bd\u03c4\u03cd\u03c0\u03c9\u03c3\u03b7 \u03c0\u03c9\u03c2 \u03ba\u03b1\u03b9 \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b5\u03af\u03bd\u03b1\u03b9 \u03b1\u03bd\u03bf\u03b9\u03c7\u03c4\u03ae \u03c0\u03c1\u03bf\u03c2 \u03b1\u03c5\u03c4\u03ae \u03c4\u03b7 \u03c3\u03ba\u03ad\u03c8\u03b7 - \u03bd\u03b1 \u03b4\u03b9\u03b5\u03be\u03ac\u03b3\u03bf\u03c5\u03bc\u03b5 \u03b5\u03bd\u03c4\u03cc\u03c2 \u03c4\u03bf\u03c5 \u03a6\u03b5\u03b2\u03c1\u03bf\u03c5\u03b1\u03c1\u03af\u03bf\u03c5 \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03b5\u03c0\u03af \u03c4\u03bf\u03c5 \u03bc\u03b1\u03ba\u03c1\u03bf\u03c0\u03c1\u03cc\u03b8\u03b5\u03c3\u03bc\u03bf\u03c5 \u03c0\u03c1\u03bf\u03b3\u03c1\u03ac\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03ad\u03c9\u03c2 \u03c4\u03bf \u03ad\u03c4\u03bf\u03c2 2005 - \u03b5\u03bb\u03c0\u03af\u03b6\u03c9 \u03c0\u03c9\u03c2 \u03ba\u03b1\u03b9 \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03b8\u03b1 \u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03ae\u03c3\u03b5\u03b9 \u03bc\u03ad\u03c7\u03c1\u03b9 \u03c4\u03cc\u03c4\u03b5 \u03cc\u03c3\u03bf\u03bd \u03b1\u03c6\u03bf\u03c1\u03ac \u03ba\u03ac\u03c0
 \u03b9\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03c0\u03bf\u03c5 \u03b8\u03b1 \u03bc\u03b1\u03c2 \u03c0\u03c1\u03bf\u03c4\u03b5\u03af\u03bd\u03b5\u03b9 - \u03ba\u03b1\u03b9 \u03bd\u03b1 \u03b4\u03b9\u03b5\u03be\u03b1\u03b3\u03ac\u03b3\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b1\u03c5\u03c4\u03cc\u03c7\u03c1\u03bf\u03bd\u03b1 \u03c4\u03bf\u03bd \u03a6\u03b5\u03b2\u03c1\u03bf\u03c5\u03ac\u03c1\u03b9\u03bf \u03ba\u03b1\u03b9 \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03bf\u03bd \u03c0\u03c1\u03bf\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1\u03c4\u03b9\u03c3\u03bc\u03cc \u03c4\u03bf\u03c5 \u03bd\u03bf\u03bc\u03bf\u03b8\u03b5\u03c4\u03b9\u03ba\u03bf\u03cd \u03ad\u03c1\u03b3\u03bf\u03c5 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03b3\u03b9\u03b1 \u03c4\u03bf \u03ad\u03c4\u03bf\u03c2 2000. \u0395\u03c0\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2, \u03c5\u03c0\u03ac\u03c1\u03c7\u03b5\u03b9 \u03ba\u03b1\u03b9 \u03bc\u03b9\u03b1 \u03bb\u03bf\u03b3\u03b9\u03ba\u03ae \u03b1\u03bd\u03c4\u03b9\u03ba\u03b5\u03b9\u03bc\u03b5\u03bd\u03b9\u03ba\u03ae \u03c3\u03c5\u03bd\u03bf\u03c7\u03ae \u03c0\u03bf\u03c5 \u03bc\u03b1\u03c2 \u03c3\u03c5\u03bd\u03b9\u03c3\u03c4\u03ac \u03bd\u03b1 \u03b4\u03b9\u03b5\u03be\u03b1\u03b3\u03ac\u03b3\u03bf\u03c5\u03bc\u03b5 \u03b1\u03c0\u03cc \u03ba\u03bf\u03b9\u03bd\u03bf\u03cd \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03ba\u03b1\u03b9 \u03b3\u03b9\u03b1 \u03c4\u03b1 \u03b4\u03cd\u03bf \u03c0\u03c1\u03bf\u03b3\u03c1\u03ac\u03bc\u03bc\u03b1\u03c4\u03b1. 
\u0393\u03b9\u03b1 \u03c4\u03bf\u03bd \u03bb\u03cc\u03b3\u03bf \u03b1\u03c5\u03c4\u03cc, \u03b7 \u039f\u03bc\u03ac\u03b4\u03b1 \u03bc\u03bf\u03c5 \u03b1\u03c0\u03bf\u03c1\u03c1\u03af\u03c0\u03c4\u03b5\u03b9 \u03ba\u03b1\u03c4\u03b7\u03b3\u03bf\u03c1\u03b7\u03bc\u03b1\u03c4\u03b9\u03ba\u03ac \u03c4\u03b7\u03bd \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7 \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd!
+(\u03a7\u03b5\u03b9\u03c1\u03bf\u03ba\u03c1\u03bf\u03c4\u03ae\u03bc\u03b1\u03c4\u03b1 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03bf\u03cd \u039b\u03b1\u03ca\u03ba\u03bf\u03cd \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 (\u03a7\u03c1\u03b9\u03c3\u03c4\u03b9\u03b1\u03bd\u03bf\u03b4\u03b7\u03bc\u03bf\u03ba\u03c1\u03ac\u03c4\u03b5\u03c2) \u03ba\u03b1\u03b9 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u0394\u03b7\u03bc\u03bf\u03ba\u03c1\u03b1\u03c4\u03ce\u03bd)
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b8\u03ad\u03bb\u03c9 \u03bd\u03b1 \u03ba\u03b1\u03c4\u03b1\u03c3\u03c4\u03ae\u03c3\u03c9 \u03b1\u03c0\u03bf\u03bb\u03cd\u03c4\u03c9\u03c2 \u03c3\u03b1\u03c6\u03ad\u03c2 \u03cc\u03c4\u03b9, \u03ba\u03b1\u03c4\u03b1\u03c1\u03c7\u03ac\u03c2, \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03c3\u03ad\u03b2\u03b5\u03c4\u03b1\u03b9 \u03c3\u03c4\u03bf \u03bc\u03ad\u03b3\u03b9\u03c3\u03c4\u03bf \u03b2\u03b1\u03b8\u03bc\u03cc \u03c4\u03b9\u03c2 \u03b1\u03c0\u03bf\u03c6\u03ac\u03c3\u03b5\u03b9\u03c2 \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03ba\u03b1\u03b9, \u03bc\u03b5\u03c4\u03b1\u03be\u03cd \u03b1\u03c5\u03c4\u03ce\u03bd, \u03c4\u03b7\u03bd \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7 \u03bd\u03b1 \u03ba\u03b1\u03b8\u03bf\u03c1\u03af\u03b6\u03b5\u03b9 \u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03ae \u03c4\u03bf\u03c5. \u0395\u03c0\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2, \u03b5\u03bc\u03b5\u03af\u03c2 \u03c3\u03b5\u03b2\u03cc\u03bc\u03b1\u03c3\u03c4\u03b5 \u03c4\u03b7\u03bd \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7 \u03c0\u03bf\u03c5 \u03bc\u03c0\u03bf\u03c1\u03b5\u03af \u03bd\u03b1 \u03bb\u03ac\u03b2\u03b5\u03b9 \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03c9\u03c2 \u03c0\u03c1\u03bf\u03c2 \u03b1\u03c5\u03c4\u03cc.
+\u038c\u03bc\u03c9\u03c2, \u03b8\u03ad\u03bb\u03c9 \u03b5\u03c0\u03af\u03c3\u03b7\u03c2 \u03bd\u03b1 \u03ba\u03b1\u03c4\u03b1\u03c3\u03c4\u03ae\u03c3\u03c9 \u03b1\u03c0\u03bf\u03bb\u03cd\u03c4\u03c9\u03c2 \u03c3\u03b1\u03c6\u03ad\u03c2 \u03cc\u03c4\u03b9, \u03bf \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf\u03c2 Prodi \u03b4\u03b5\u03c3\u03bc\u03b5\u03cd\u03c4\u03b7\u03ba\u03b5 \u03ad\u03bd\u03b1\u03bd\u03c4\u03b9 \u03c4\u03bf\u03c5 \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03af\u03bf\u03c5 \u03bd\u03b1 \u03b5\u03bd\u03c3\u03c9\u03bc\u03b1\u03c4\u03ce\u03c3\u03b5\u03b9 \u03bc\u03b9\u03b1 \u03bd\u03ad\u03b1 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7, \u03cc\u03c0\u03c9\u03c2 \u03c5\u03c0\u03b5\u03bd\u03b8\u03cd\u03bc\u03b9\u03c3\u03b5 \u03bf \u03ba. Bar\u00f3n, \u03b7 \u03bf\u03c0\u03bf\u03af\u03b1 \u03b8\u03b1 \u03c0\u03c1\u03bf\u03c3\u03c4\u03b5\u03b8\u03b5\u03af \u03c3\u03c4\u03b7\u03bd \u03b5\u03c4\u03ae\u03c3\u03b9\u03b1 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03bf \u03bd\u03bf\u03bc\u03bf\u03b8\u03b5\u03c4\u03b9\u03ba\u03cc \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2, \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03b9\u03c2 \u03c3\u03b7\u03bc\u03b1\u03bd\u03c4\u03b9\u03ba\u03ad\u03c2 \u03b3\u03c1\u03b1\u03bc\u03bc\u03ad\u03c2 \u03b4\u03c1\u03ac\u03c3\u03b7\u03c2 \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03b7 \u03c0\u03b5\u03bd\u03c4\u03b1\u03b5\u03c4\u03af\u03b1, \u03b4\u03b7\u03bb\u03b1\u03b4\u03ae, \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03c4\u03c1\u03ad\u03c7\u03bf\u03c5\u03c3\u03b1 \u03ba\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03b5\u03c5\u03c4\u03b9\u03ba\u03ae \u03c0\u03b5\u03c1\u03af\u03bf\u03b4\u03bf.
+\u0398\u03ad\u03bb\u03c9 \u03bd\u03b1 \u03b1\u03bd\u03b1\u03c6\u03ad\u03c1\u03c9, \u03ba\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03cc\u03c4\u03b9 \u03b1\u03c5\u03c4\u03ae \u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03b4\u03b5\u03bd \u03c4\u03b1\u03c5\u03c4\u03b9\u03b6\u03cc\u03c4\u03b1\u03bd, \u03c3\u03c4\u03b7 \u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03af\u03b1 \u03b7 \u03bf\u03c0\u03bf\u03af\u03b1 \u03b5\u03c0\u03b9\u03c4\u03b5\u03cd\u03c7\u03b8\u03b7\u03ba\u03b5 \u03c4\u03bf \u03a3\u03b5\u03c0\u03c4\u03ad\u03bc\u03b2\u03c1\u03b9\u03bf, \u03bc\u03b5 \u03c4\u03b7\u03bd \u03b5\u03c4\u03ae\u03c3\u03b9\u03b1 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03af\u03b1\u03c3\u03b7 \u03c4\u03bf\u03c5 \u03bd\u03bf\u03bc\u03bf\u03b8\u03b5\u03c4\u03b9\u03ba\u03bf\u03cd \u03c0\u03c1\u03bf\u03b3\u03c1\u03ac\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2. \u039a\u03b1\u03b9 \u03b8\u03ad\u03bb\u03c9 \u03bd\u03b1 \u03c0\u03c9, \u03ba\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03cc\u03c4\u03b9, \u03b5\u03ba \u03bc\u03ad\u03c1\u03bf\u03c5\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2, \u03b5\u03af\u03bc\u03b1\u03c3\u03c4\u03b5 \u03ad\u03c4\u03bf\u03b9\u03bc\u03bf\u03b9 \u03ba\u03b1\u03b9 \u03b4\u03b9\u03b1\u03c4\u03b5\u03b8\u03b5\u03b9\u03bc\u03ad\u03bd\u03bf\u03b9 \u03bd\u03b1 \u03c0\u03c1\u03b1\u03b3\u03bc\u03b1\u03c4\u03bf\u03c0\u03bf\u03b9\u03ae\u03c3\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 \u03b1\u03c5\u03c4\u03ae \u03c4\u03b7\u03bd \u03ba\u03b1\u03c4\u03ac\u03bb\u03bb\u03b7\u03bb\u03b7 \u03c3\u03c4\u03b9\u03b3\u03bc\u03ae, \u03cc\u03c4\u03b9 \u03ae\u03bc\u03b1\u03c3\u03c4\u03b1\u03bd \u03ad\u03c4\u03bf\u03b9\u03bc\u03bf\u03b9 \u03bd\u03b1 \u03b1\u03bd\u03b1\u03c0\u03c4\u03cd\u03be\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7 
\u03b1\u03c5\u03c4\u03ae\u03bd \u03c4\u03b7\u03bd \u03b5\u03b2\u03b4\u03bf\u03bc\u03ac\u03b4\u03b1, \u03cc\u03c0\u03c9\u03c2 \u03b5\u03af\u03c7\u03b5 \u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03b7\u03b8\u03b5\u03af \u03b1\u03c1\u03c7\u03b9\u03ba\u03ac, \u03be\u03b5\u03ba\u03b9\u03bd\u03ce\u03bd\u03c4\u03b1\u03c2 \u03b1\u03c0\u03cc \u03c4\u03b7 \u03b2\u03ac\u03c3\u03b7 \u03cc\u03c4\u03b9 \u03b8\u03b1 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03b9\u03b1\u03b6\u03cc\u03c4\u03b1\u03bd \u03c4\u03b7\u03bd \u03c0\u03c1\u03bf\u03b7\u03b3\u03bf\u03cd\u03bc\u03b5\u03bd\u03b7 \u03bc\u03ad\u03c1\u03b1 \u03c3\u03b5 \u03ad\u03bd\u03b1\u03bd \u03bb\u03cc\u03b3\u03bf \u03c0\u03c1\u03bf\u03c2 \u03c4\u03b9\u03c2 \u03ba\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03c5\u03bb\u03b5\u03c5\u03c4\u03b9\u03ba\u03ad\u03c2 \u03bf\u03bc\u03ac\u03b4
 \u03c2.
+\u0395\u03c0\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2, \u03ba\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b8\u03ad\u03bb\u03c9 \u03bd\u03b1 \u03b5\u03c0\u03b1\u03bd\u03b1\u03bb\u03ac\u03b2\u03c9 \u03cc\u03c4\u03b9, \u03b1\u03c0\u03cc \u03c4\u03b7 \u03b4\u03b9\u03ba\u03ae \u03bc\u03b1\u03c2 \u03c0\u03bb\u03b5\u03c5\u03c1\u03ac, \u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03c3\u03c5\u03b6\u03b7\u03c4\u03ae\u03c3\u03b5\u03b9 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03b4\u03c1\u03ac\u03c3\u03b7\u03c2 \u03b3\u03b9\u03b1 \u03c4\u03b1 \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03b1 \u03c0\u03ad\u03bd\u03c4\u03b5 \u03c7\u03c1\u03cc\u03bd\u03b9\u03b1 \u03ba\u03b1\u03b9 \u03b5\u03af\u03bc\u03b1\u03c3\u03c4\u03b5 \u03ad\u03c4\u03bf\u03b9\u03bc\u03bf\u03b9 \u03ce\u03c3\u03c4\u03b5, \u03cc\u03c4\u03b1\u03bd \u03c4\u03bf \u03b1\u03c0\u03bf\u03c6\u03b1\u03c3\u03af\u03c3\u03b5\u03b9 \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf, - \u03b1\u03ba\u03cc\u03bc\u03b1 \u03ba\u03b1\u03b9 \u03b1\u03c5\u03c4\u03ae\u03bd \u03c4\u03b7\u03bd \u03b5\u03b2\u03b4\u03bf\u03bc\u03ac\u03b4\u03b1, \u03b1\u03bd \u03b1\u03c5\u03c4\u03ae \u03b5\u03af\u03bd\u03b1\u03b9 \u03b7 \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7 - \u03bd\u03b1 \u03c0\u03b1\u03c1\u03bf\u03c5\u03c3\u03b9\u03ac\u03c3\u03bf\u03c5\u03bc\u03b5 \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03b3\u03b9\u03b1 \u03c4\u03b1 \u03c0\u03ad\u03bd\u03c4\u03b5 \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03b1 \u03ad\u03c4\u03b7 \u03ba\u03b1\u03b9, \u03c4\u03bf\u03bd \u03b5\u03c0\u03cc\u03bc\u03b5\u03bd\u03bf \u03bc\u03ae\u03bd\u03b1, \u03c4\u03bf \u03c0\u03c1\u03cc\u03b3\u03c1\u03b1\u03bc\u03bc\u03b1 \u03b3\u03b9\u03b1 \u03c4\u03bf \u03ad\u03c4\u03bf\u03c2 2000, \u03b1\u03c5\u03c4\u03cc \u03b4\u03b7\u03bb\u03b1\u03b4\u03ae \u03b1\u03ba\u03c1\u03b9\u03b2\u03ce\u03c2 \u03c0\u03bf\u03c5 \u03b5\u03af\u03c7\u03b5 
\u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03b7\u03b8\u03b5\u03af.
+
+\u03a0\u03c1\u03bf\u03c4\u03b5\u03af\u03bd\u03c9 \u03bd\u03b1 \u03c8\u03b7\u03c6\u03af\u03c3\u03bf\u03c5\u03bc\u03b5 \u03b5\u03c0\u03af \u03c4\u03b7\u03c2 \u03b1\u03b9\u03c4\u03ae\u03c3\u03b5\u03c9\u03c2 \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03b5\u03c0\u03b1\u03bd\u03b5\u03b3\u03b3\u03c1\u03b1\u03c6\u03ae \u03c4\u03b7\u03c2 \u03b4\u03ae\u03bb\u03c9\u03c3\u03b7\u03c2 \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03bf\u03c5\u03c2 \u03c3\u03c4\u03c1\u03b1\u03c4\u03b7\u03b3\u03b9\u03ba\u03bf\u03cd\u03c2 \u03c3\u03c4\u03cc\u03c7\u03bf\u03c5\u03c2 \u03c4\u03b7\u03c2.
+(\u03a4\u03bf \u03a3\u03ce\u03bc\u03b1 \u03b1\u03c0\u03bf\u03c1\u03c1\u03af\u03c0\u03c4\u03b5\u03b9 \u03c4\u03b7\u03bd \u03b1\u03af\u03c4\u03b7\u03c3\u03b7) \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf\u03c2. \u03a0\u03ac\u03bd\u03c4\u03bf\u03c4\u03b5 \u03c3\u03b5 \u03cc,\u03c4\u03b9 \u03b1\u03c6\u03bf\u03c1\u03ac \u03c4\u03b7\u03bd \u03b7\u03bc\u03ad\u03c1\u03b1 \u03c4\u03b7\u03c2 \u03a4\u03b5\u03c4\u03ac\u03c1\u03c4\u03b7\u03c2, \u03ad\u03c7\u03c9 \u03bc\u03af\u03b1 \u03ac\u03bb\u03bb\u03b7 \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7 \u03c0\u03bf\u03c5 \u03b1\u03c6\u03bf\u03c1\u03ac \u03c4\u03b7\u03bd \u03c0\u03c1\u03bf\u03c6\u03bf\u03c1\u03b9\u03ba\u03ae \u03b5\u03c1\u03ce\u03c4\u03b7\u03c3\u03b7 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03bf\u03bd \u03c6\u03cc\u03c1\u03bf \u03b5\u03c0\u03af \u03c4\u03bf\u03c5 \u03ba\u03b5\u03c6\u03b1\u03bb\u03b1\u03af\u03bf\u03c5. \u0397 \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03bf\u03cd \u039b\u03b1\u03ca\u03ba\u03bf\u03cd \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 (\u03a7\u03c1\u03b9\u03c3\u03c4\u03b9\u03b1\u03bd\u03bf\u03b4\u03b7\u03bc\u03bf\u03ba\u03c1\u03ac\u03c4\u03b5\u03c2) \u03ba\u03b1\u03b9 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u0394\u03b7\u03bc\u03bf\u03ba\u03c1\u03b1\u03c4\u03ce\u03bd \u03b6\u03b7\u03c4\u03ac \u03bd\u03b1 \u03b1\u03c0\u03bf\u03c3\u03c5\u03c1\u03b8\u03b5\u03af \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u03c3\u03b7\u03bc\u03b5\u03af\u03bf \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7.
+\u03a5\u03c0\u03ac\u03c1\u03c7\u03b5\u03b9 \u03ba\u03ac\u03c0\u03bf\u03b9\u03bf\u03c2 \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03bf\u03c2 \u03b3\u03b9\u03b1 \u03bd\u03b1 \u03bb\u03ac\u03b2\u03b5\u03b9 \u03c4\u03bf\u03bd \u03bb\u03cc\u03b3\u03bf \u03b5\u03be \u03bf\u03bd\u03cc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03b7\u03c2 \u03c3\u03c5\u03b3\u03ba\u03b5\u03ba\u03c1\u03b9\u03bc\u03ad\u03bd\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03ba\u03b1\u03b9 \u03bd\u03b1 \u03b1\u03b9\u03c4\u03b9\u03bf\u03bb\u03bf\u03b3\u03ae\u03c3\u03b5\u03b9 \u03b1\u03c5\u03c4\u03ae \u03c4\u03b7\u03bd \u03b1\u03af\u03c4\u03b7\u03c3\u03b7;
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b5\u03c0\u03b5\u03b9\u03b4\u03ae \u03b1\u03ba\u03bf\u03cd\u03c9 \u03ba\u03ac\u03c0\u03bf\u03b9\u03b1 \u03b3\u03ad\u03bb\u03b9\u03b1 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03c0\u03bb\u03b5\u03c5\u03c1\u03ac \u03c4\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd, \u03b5\u03af\u03bd\u03b1\u03b9 \u03ba\u03b1\u03bb\u03cc \u03bd\u03b1 \u03b3\u03bd\u03c9\u03c1\u03af\u03b6\u03b5\u03c4\u03b5 \u03c0\u03c9\u03c2 \u03bc\u03bf\u03c5 \u03b1\u03bd\u03b1\u03c6\u03ad\u03c1\u03b8\u03b7\u03ba\u03b5 \u03c4\u03bf \u03b3\u03b5\u03b3\u03bf\u03bd\u03cc\u03c2 \u03cc\u03c4\u03b9 \u03ba\u03b1\u03b9 \u03c3\u03c4\u03b7\u03bd \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd \u03c5\u03c0\u03ac\u03c1\u03c7\u03bf\u03c5\u03bd \u03b5\u03c5\u03c1\u03b5\u03af\u03c2 \u03ba\u03cd\u03ba\u03bb\u03bf\u03b9 \u03c0\u03bf\u03c5 \u03b8\u03b1 \u03b5\u03c0\u03b9\u03b8\u03c5\u03bc\u03bf\u03cd\u03c3\u03b1\u03bd \u03c0\u03bf\u03bb\u03cd \u03c4\u03b7\u03bd \u03b1\u03c0\u03cc\u03c3\u03c5\u03c1\u03c3\u03b7 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7 \u03c4\u03bf\u03c5 \u03b5\u03bd \u03bb\u03cc\u03b3\u03c9 \u03b8\u03ad\u03bc\u03b1\u03c4\u03bf\u03c2, \u03b5\u03c0\u03b5\u03b9\u03b4\u03ae \u03ba\u03b1\u03c4\u03ac \u03c4\u03b7\u03bd \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1 \u03c3\u03c4\u03b7 \u0394\u03b9\u03ac\u03c3\u03ba\u03b5\u03c8\u03b7 \u03c4\u03c9\u03bd \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03c9\u03bd \u03b4\u03b5\u03bd \u03b5\u03af\u03c7\u03b5 \u03ba\u03b1\u03c4\u03b1\u03c4\u03b5\u03b8\u03b5\u03af \u03b7 \u03c8\u03ae\u03c6\u03bf\u03c2 \u03c4\u03b7\u03c2 \u03bf\u03bc\u03ac\u03b4\u03b1\u03c2 
\u03b5\u03c1\u03b3\u03b1\u03c3\u03af\u03b1\u03c2 \u03c4\u03c9\u03bd \u03b1\u03c1\u03bc\u03cc\u03b4\u03b9\u03c9\u03bd \u03c3\u03c5\u03bd\u03b1\u03b4\u03ad\u03bb\u03c6\u03c9\u03bd \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd. \u0394\u03b5\u03bd \u03b3\u03bd\u03c9\u03c1\u03af\u03b6\u03c9 \u03b5\u03ac\u03bd \u03b5\u03af\u03bd\u03b1\u03b9 \u03c3\u03c9\u03c3\u03c4\u03ae \u03b1\u03c5\u03c4\u03ae \u03b7 \u03c0\u03bb\u03b7\u03c1\u03bf\u03c6\u03bf\u03c1\u03af\u03b1, \u03b5\u03bc\u03b5\u03af\u03c2 \u03cc\u03bc\u03c9\u03c2 \u03c9\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca
 \u03bf\u03cd \u039b\u03b1\u03ca\u03ba\u03bf\u03cd \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03ba\u03b1\u03b9 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u0394\u03b7\u03bc\u03bf\u03ba\u03c1\u03b1\u03c4\u03ce\u03bd \u03b8\u03b1 \u03b5\u03af\u03bc\u03b1\u03c3\u03c4\u03b5 \u03c3\u03af\u03b3\u03bf\u03c5\u03c1\u03b1 \u03b5\u03c5\u03b3\u03bd\u03ce\u03bc\u03bf\u03bd\u03b5\u03c2 \u03b5\u03ac\u03bd \u03b1\u03c0\u03bf\u03c3\u03c5\u03c1\u03cc\u03c4\u03b1\u03bd \u03c4\u03bf \u03b8\u03ad\u03bc\u03b1 \u03b1\u03c5\u03c4\u03cc, \u03b5\u03c0\u03b5\u03b9\u03b4\u03ae \u03c4\u03bf \u039a\u03bf\u03b9\u03bd\u03bf\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf \u03ad\u03c7\u03b5\u03b9 \u03ae\u03b4\u03b7 \u03b1\u03c3\u03c7\u03bf\u03bb\u03b7\u03b8\u03b5\u03af, \u03c9\u03c2 \u03b3\u03bd\u03c9\u03c3\u03c4\u03cc\u03bd, \u03c0\u03bf\u03bb\u03bb\u03ad\u03c2 \u03c6\u03bf\u03c1\u03ad\u03c2 \u03bc\u03b5 \u03c4\u03bf \u03b5\u03bd \u03bb\u03cc\u03b3\u03c9 \u03b6\u03ae\u03c4\u03b7\u03bc\u03b1. \u03a5\u03c0\u03ac\u03c1\u03c7\u03bf\u03c5\u03bd \u03bc\u03ac\u03bb\u03b9\u03c3\u03c4\u03b1 \u03ba\u03b1\u03b9 \u03b1\u03c0\u03bf\u03c6\u03ac\u03c3\u03b5\u03b9\u03c2 \u03ba\u03b1\u03c4\u03ac \u03c4\u03b7\u03c2 \u03b5\u03c0\u03b9\u03b2\u03bf\u03bb\u03ae\u03c2 \u03b5\u03bd\u03cc\u03c2 \u03c0\u03b1\u03c1\u03cc\u03bc\u03bf\u03b9\u03bf\u03c5 \u03c6\u03cc\u03c1\u03bf\u03c5. \u0393\u03b9\u03b1 \u03c4\u03bf\u03bd \u03bb\u03cc\u03b3\u03bf \u03b1\u03c5\u03c4\u03cc, \u03b7 \u039f\u03bc\u03ac\u03b4\u03b1 \u03bc\u03bf\u03c5 \u03b6\u03b7\u03c4\u03b5\u03af \u03bd\u03b1 \u03b1\u03c0\u03bf\u03c3\u03c5\u03c1\u03b8\u03b5\u03af \u03c4\u03bf \u03b8\u03ad\u03bc\u03b1 \u03b1\u03c5\u03c4\u03cc \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7.
+
+\u03a3\u03b1\u03c2 \u03b5\u03c5\u03c7\u03b1\u03c1\u03b9\u03c3\u03c4\u03ce, \u03ba\u03cd\u03c1\u03b9\u03b5 Poettering.
+\u0398\u03b1 \u03b1\u03ba\u03bf\u03cd\u03c3\u03bf\u03c5\u03bc\u03b5 \u03c4\u03ce\u03c1\u03b1 \u03c4\u03bf\u03bd \u03ba. Wurtz, \u03bf \u03bf\u03c0\u03bf\u03af\u03bf\u03c2 \u03b1\u03bd\u03c4\u03b9\u03c4\u03af\u03b8\u03b5\u03c4\u03b1\u03b9 \u03c3\u03c4\u03b7 \u03c3\u03c5\u03b3\u03ba\u03b5\u03ba\u03c1\u03b9\u03bc\u03ad\u03bd\u03b7 \u03b1\u03af\u03c4\u03b7\u03c3\u03b7.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b8\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03ba\u03b1\u03c4\u03b1\u03c1\u03c7\u03ac\u03c2 \u03bd\u03b1 \u03c5\u03c0\u03bf\u03b3\u03c1\u03b1\u03bc\u03bc\u03af\u03c3\u03c9 \u03c4\u03b7\u03bd \u03ad\u03bb\u03bb\u03b5\u03b9\u03c8\u03b7 \u03bb\u03bf\u03b3\u03b9\u03ba\u03ae\u03c2 \u03c4\u03bf\u03c5 \u03ba. Poettering. \u0391\u03c5\u03c4\u03ae \u03c4\u03b7 \u03c3\u03c4\u03b9\u03b3\u03bc\u03ae, \u03bc\u03cc\u03bb\u03b9\u03c2 \u03ad\u03ba\u03b1\u03bd\u03b5 \u03bc\u03ac\u03b8\u03b7\u03bc\u03b1 \u03b7\u03b8\u03b9\u03ba\u03ae\u03c2 \u03c3\u03c4\u03b7\u03bd \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u03a3\u03bf\u03c3\u03b9\u03b1\u03bb\u03b9\u03c3\u03c4\u03ce\u03bd, \u03b4\u03b9\u03cc\u03c4\u03b9 \u03b1\u03bd\u03b1\u03af\u03c1\u03b5\u03c3\u03b5 \u03bc\u03af\u03b1 \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7 \u03c0\u03bf\u03c5 \u03ad\u03bb\u03b1\u03b2\u03b5 \u03bc\u03b5 \u03b9\u03b4\u03b9\u03b1\u03af\u03c4\u03b5\u03c1\u03b1 \u03be\u03b5\u03ba\u03ac\u03b8\u03b1\u03c1\u03bf \u03c4\u03c1\u03cc\u03c0\u03bf \u03b7 \u0394\u03b9\u03ac\u03c3\u03ba\u03b5\u03c8\u03b7 \u03c4\u03c9\u03bd \u03a0\u03c1\u03bf\u03ad\u03b4\u03c1\u03c9\u03bd. \u03a9\u03c3\u03c4\u03cc\u03c3\u03bf, \u03ba\u03b1\u03b9 \u03b1\u03c5\u03c4\u03cc\u03c2 \u03c0\u03c1\u03ac\u03c4\u03c4\u03b5\u03b9 \u03c4\u03bf \u03af\u03b4\u03b9\u03bf \u03c0\u03c1\u03ac\u03b3\u03bc\u03b1. 
\u03a3\u03c5\u03b6\u03b7\u03c4\u03ae\u03c3\u03b1\u03bc\u03b5 \u03ba\u03b1\u03b9 \u03b5\u03af\u03bc\u03b1\u03c3\u03c4\u03b1\u03bd \u03bf\u03bc\u03cc\u03c6\u03c9\u03bd\u03bf\u03b9 \u03b5\u03ba\u03c4\u03cc\u03c2 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03bf\u03c5 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03bf\u03cd \u039b\u03b1\u03ca\u03ba\u03bf\u03cd \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03ba\u03b1\u03b9 \u03c4\u03b7\u03bd \u039f\u03bc\u03ac\u03b4\u03b1 \u03c4\u03c9\u03bd \u03a6\u03b9\u03bb\u03b5\u03bb\u03b5\u03c5\u03b8\u03ad\u03c1\u03c9\u03bd, \u03ba\u03b1\u03b9 \u03b5\u03af\u03c7\u03b1 \u03bc\u03ac\u03bb\u03b9\u03c3\u03c4\u03b1 \u03b5\u03c0\u03b9\u03c3\u03b7\u03bc\u03ac\u03bd\u03b5\u03b9, \u03b8\u03b1 \u03c4\u03bf \u03b8\u03c5\u03bc\u03ac\u03c3\u03c4\u03b5 \u03b1\u03b3\u03b1\u03c0\u03b7\u03c4\u03bf\u03af \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03bf\u03b9, \u03c0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf\u03b9 \u03c4\u03c9\u03bd \u039f\u03bc\u03ac\u03b4\u03c9\u03bd, \u03cc\u03c4\u03b9 \u03c4\u03bf \u03b6\u03ae\u03c4\u03b7\u03bc\u03b1 \u03b4\u03b5\u03bd \u03b5\u03af\u03bd\u03b1\u03b9 \u03b5\u03ac\u03bd \u03b5\u03af\u03c3\u03c4\u03b5 \u03c5\u03c0\u03ad\u03c1 
 \u03ae \u03ba\u03b1\u03c4\u03ac \u03c4\u03bf\u03c5 \u03c6\u03cc\u03c1\u03bf\u03c5 Tobin, \u03b1\u03bb\u03bb\u03ac \u03b5\u03ac\u03bd \u03c4\u03bf\u03bb\u03bc\u03ac\u03c4\u03b5 \u03bd\u03b1 \u03b1\u03ba\u03bf\u03cd\u03c3\u03b5\u03c4\u03b5 \u03c4\u03b9 \u03c3\u03ba\u03ad\u03c0\u03c4\u03bf\u03bd\u03c4\u03b1\u03b9 \u03b5\u03c0\u03af \u03c4\u03bf\u03c5 \u03c0\u03c1\u03bf\u03ba\u03b5\u03b9\u03bc\u03ad\u03bd\u03bf\u03c5 \u03b7 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03ba\u03b1\u03b9 \u03c4\u03bf \u03a3\u03c5\u03bc\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf. \u0394\u03b5\u03bd \u03b6\u03b7\u03c4\u03ac\u03bc\u03b5 \u03c0\u03bf\u03bb\u03bb\u03ac. \u03a3\u03c5\u03bd\u03b5\u03c0\u03ce\u03c2, \u03b5\u03c0\u03b1\u03bd\u03b1\u03bb\u03b1\u03bc\u03b2\u03ac\u03bd\u03c9 \u03c4\u03b7\u03bd \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7 \u03bd\u03b1 \u03b4\u03b9\u03b1\u03c4\u03b7\u03c1\u03b7\u03b8\u03b5\u03af \u03b7 \u03b5\u03bd \u03bb\u03cc\u03b3\u03c9 \u03c0\u03c1\u03bf\u03c6\u03bf\u03c1\u03b9\u03ba\u03ae \u03b5\u03c1\u03ce\u03c4\u03b7\u03c3\u03b7 \u03c0\u03c1\u03bf\u03c2 \u03c4\u03b7\u03bd \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae \u03ba\u03b1\u03b9 \u03c4\u03bf \u03a3\u03c5\u03bc\u03b2\u03bf\u03cd\u03bb\u03b9\u03bf, \u03b3\u03b9\u03b1 \u03bd\u03b1 \u03bc\u03ac\u03b8\u03bf\u03c5\u03bc\u03b5 \u03ac\u03c0\u03b1\u03be \u03b4\u03b9\u03b1 \u03c0\u03b1\u03bd\u03c4\u03cc\u03c2 \u03c4\u03b7 \u03b8\u03ad\u03c3\u03b7 \u03c4\u03c9\u03bd \u03b5\u03bd \u03bb\u03cc\u03b3\u03c9 \u03b4\u03cd\u03bf \u03b1\u03c1\u03c7\u03ce\u03bd \u03c3\u03b5 \u03c3\u03c7\u03ad\u03c3\u03b7 \u03bc\u03b5 \u03c4\u03b7\u03bd \u03c0\u03c1\u03bf\u03ba\u03b5\u03b9\u03bc\u03ad\u03bd\u03b7 \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7, \u03b7 \u03bf\u03c0\u03bf\u03af\u03b1 \u03b5\u03af\u03bd\u03b1\u03b9 \u03c3\u03c7\u03b5\u03c4\u03b9\u03ba\u03ac \u03bc\u03b5\u03c4\u03c1\u03b9\u03bf\u03c0\u03b1\u03b8\u03ae\u03c2, \u03b1\u03bb\u03bb\u03ac \u03c0\u03bf\u03c5 \u03b8\u03b1 \u03ad\u03c3\u03c4\u03b5\u03bb\u03bd\u03b5, 
\u03c9\u03c3\u03c4\u03cc\u03c3\u03bf, \u03ad\u03bd\u03b1 \u03c3\u03b7\u03bc\u03b1\u03bd\u03c4\u03b9\u03ba\u03cc \u03bc\u03ae\u03bd\u03c5\u03bc\u03b1 \u03c3\u03c4\u03b7\u03bd \u03ba\u03bf\u03b9\u03bd\u03ae \u03b3\u03bd\u03ce\u03bc\u03b7, \u03b9\u03b4\u03b9\u03b1\u03af\u03c4\u03b5\u03c1\u03b1 \u03bc\u03b5\u03c4\u03ac \u03c4\u03b7\u03bd \u03c4\u03b1\u03c1\u03b1\u03c7\u03ae \u03c0\u03bf\u03c5 \u03c0\u03c1\u03bf\u03ba\u03ac\u03bb\u03b5\u03c3\u03b5 \u03b7 \u03b1\u03c0\u03bf\u03c4\u03c5\u03c7\u03af\u03b1 \u03c4\u03b7\u03c2 \u0394\u03b9\u03ac\u03c3\u03ba\u03b5\u03c8\u03b7\u03c2 \u03c4\u03bf\u03c5 Seattle.
+
+\u0398\u03b1 \u03c8\u03b7\u03c6\u03af\u03c3\u03bf\u03c5\u03bc\u03b5 \u03b5\u03c0\u03af \u03c4\u03b7\u03c2 \u03b1\u03b9\u03c4\u03ae\u03c3\u03b5\u03c9\u03c2 \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03c4\u03bf\u03c5 \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03ca\u03ba\u03bf\u03cd \u039b\u03b1\u03ca\u03ba\u03bf\u03cd \u039a\u03cc\u03bc\u03bc\u03b1\u03c4\u03bf\u03c2 (\u03a7\u03c1\u03b9\u03c3\u03c4\u03b9\u03b1\u03bd\u03bf\u03b4\u03b7\u03bc\u03bf\u03ba\u03c1\u03ac\u03c4\u03b5\u03c2) \u03ba\u03b1\u03b9 \u03c4\u03c9\u03bd \u0395\u03c5\u03c1\u03c9\u03c0\u03b1\u03af\u03c9\u03bd \u0394\u03b7\u03bc\u03bf\u03ba\u03c1\u03b1\u03c4\u03ce\u03bd \u03b3\u03b9\u03b1 \u03c4\u03b7\u03bd \u03b1\u03c0\u03cc\u03c3\u03c5\u03c1\u03c3\u03b7 \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7 \u03c4\u03b7\u03c2 \u03c0\u03c1\u03bf\u03c6\u03bf\u03c1\u03b9\u03ba\u03ae\u03c2 \u03b5\u03c1\u03ce\u03c4\u03b7\u03c3\u03b7\u03c2 \u03b1\u03bd\u03b1\u03c6\u03bf\u03c1\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03bf\u03bd \u03c6\u03cc\u03c1\u03bf \u03b5\u03c0\u03af \u03c4\u03bf\u03c5 \u03ba\u03b5\u03c6\u03b1\u03bb\u03b1\u03af\u03bf\u03c5.
+(\u03a4\u03bf \u03a3\u03ce\u03bc\u03b1 \u03b1\u03c0\u03bf\u03c1\u03c1\u03af\u03c0\u03c4\u03b5\u03b9 \u03c4\u03b7\u03bd \u03b1\u03af\u03c4\u03b7\u03c3\u03b7 \u03bc\u03b5 164 \u03c8\u03ae\u03c6\u03bf\u03c5\u03c2 \u03c5\u03c0\u03ad\u03c1, 166 \u03c8\u03ae\u03c6\u03bf\u03c5\u03c2 \u03ba\u03b1\u03c4\u03ac \u03ba\u03b1\u03b9 7 \u03b1\u03c0\u03bf\u03c7\u03ad\u03c2)
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b8\u03b1 \u03ae\u03b8\u03b5\u03bb\u03b1 \u03bd\u03b1 \u03b5\u03c5\u03c7\u03b1\u03c1\u03b9\u03c3\u03c4\u03ae\u03c3\u03c9 \u03c4\u03bf\u03bd \u03ba. Poettering \u03b3\u03b9\u03b1 \u03c4\u03b7 \u03b4\u03b7\u03bc\u03bf\u03c3\u03b9\u03cc\u03c4\u03b7\u03c4\u03b1 \u03c0\u03bf\u03c5 \u03ad\u03b4\u03c9\u03c3\u03b5 \u03bc\u03cc\u03bb\u03b9\u03c2 \u03c4\u03ce\u03c1\u03b1 \u03c3\u03b5 \u03b1\u03c5\u03c4\u03ae \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7. \u0395\u03c5\u03c7\u03b1\u03c1\u03b9\u03c3\u03c4\u03ce.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03c5\u03c0\u03bf\u03bb\u03bf\u03b3\u03af\u03c3\u03c4\u03b7\u03ba\u03b5 \u03b7 \u03c8\u03ae\u03c6\u03bf\u03c2 \u03bc\u03bf\u03c5, \u03c4\u03b7\u03bd \u03bf\u03c0\u03bf\u03af\u03b1 \u03b4\u03b5\u03bd \u03bc\u03c0\u03cc\u03c1\u03b5\u03c3\u03b1 \u03bd\u03b1 \u03b4\u03ce\u03c3\u03c9 \u03bc\u03b5 \u03c4\u03b1 \u03b7\u03bb\u03b5\u03ba\u03c4\u03c1\u03bf\u03bd\u03b9\u03ba\u03ac \u03bc\u03ad\u03c3\u03b1, \u03b5\u03c0\u03b5\u03b9\u03b4\u03ae \u03b4\u03b5\u03bd \u03ad\u03c7\u03c9 \u03c4\u03b7\u03bd \u03ba\u03ac\u03c1\u03c4\u03b1; \u0397 \u03c8\u03ae\u03c6\u03bf\u03c2 \u03bc\u03bf\u03c5 \u03ae\u03c4\u03b1\u03bd "\u03c5\u03c0\u03ad\u03c1";.
+
+\u03a0\u03c1\u03ac\u03b3\u03bc\u03b1\u03c4\u03b9, \u03b5\u03ac\u03bd \u03c0\u03c1\u03bf\u03c3\u03b8\u03ad\u03c3\u03bf\u03c5\u03bc\u03b5 \u03c4\u03bf\u03c5\u03c2 \u03b4\u03cd\u03bf \u03c3\u03c5\u03bd\u03b1\u03b4\u03ad\u03bb\u03c6\u03bf\u03c5\u03c2 \u03c0\u03bf\u03c5 \u03b5\u03be\u03ad\u03c6\u03c1\u03b1\u03c3\u03b1\u03bd \u03c4\u03b7 \u03b8\u03ad\u03c3\u03b7 \u03c4\u03bf\u03c5\u03c2, \u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03c9\u03c2 \u03b1\u03c0\u03bf\u03c4\u03ad\u03bb\u03b5\u03c3\u03bc\u03b1�
+(\u0394\u03b9\u03b1\u03bc\u03b1\u03c1\u03c4\u03c5\u03c1\u03af\u03b5\u03c2)
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b7 \u03a0\u03c1\u03bf\u03b5\u03b4\u03c1\u03af\u03b1 \u03b1\u03bd\u03b1\u03ba\u03bf\u03af\u03bd\u03c9\u03c3\u03b5 \u03c4\u03bf \u03b1\u03c0\u03bf\u03c4\u03ad\u03bb\u03b5\u03c3\u03bc\u03b1 \u03c4\u03b7\u03c2 \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1\u03c2. \u0394\u03b5\u03bd \u03c5\u03c0\u03ac\u03c1\u03c7\u03bf\u03c5\u03bd \u03c4\u03c1\u03bf\u03c0\u03bf\u03c0\u03bf\u03b9\u03ae\u03c3\u03b5\u03b9\u03c2.
+(\u03a7\u03b5\u03b9\u03c1\u03bf\u03ba\u03c1\u03bf\u03c4\u03ae\u03bc\u03b1\u03c4\u03b1)
+
+\u0391\u03b3\u03b1\u03c0\u03b7\u03c4\u03bf\u03af \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03bf\u03b9, \u03b1\u03ba\u03cc\u03bc\u03b7 \u03bc\u03af\u03b1 \u03c6\u03bf\u03c1\u03ac, \u03c0\u03c1\u03ad\u03c0\u03b5\u03b9 \u03ba\u03b1\u03b8\u03ad\u03bd\u03b1\u03c2 \u03bd\u03b1 \u03ad\u03c7\u03b5\u03b9 \u03cc\u03bd\u03c4\u03c9\u03c2 \u03c4\u03b7\u03bd \u03ba\u03ac\u03c1\u03c4\u03b1 \u03c4\u03bf\u03c5 \u03c4\u03b7 \u0394\u03b5\u03c5\u03c4\u03ad\u03c1\u03b1. \u0392\u03bb\u03ad\u03c0\u03bf\u03c5\u03bc\u03b5 \u03cc\u03c4\u03b9 \u03b5\u03b4\u03ce \u03ad\u03c7\u03bf\u03c5\u03bc\u03b5 \u03c0\u03c1\u03cc\u03b2\u03bb\u03b7\u03bc\u03b1. \u03a5\u03c0\u03cc \u03b1\u03c5\u03c4\u03ad\u03c2 \u03c4\u03b9\u03c2 \u03c3\u03c5\u03bd\u03b8\u03ae\u03ba\u03b5\u03c2 \u03bf\u03c6\u03b5\u03af\u03bb\u03c9 \u03bd\u03b1 \u03bb\u03ac\u03b2\u03c9 \u03bc\u03af\u03b1 \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7.
+\u039a\u03b1\u03b9 \u03b5\u03b3\u03ce \u03b5\u03c0\u03af\u03c3\u03b7\u03c2 \u03ad\u03c7\u03c9 \u03be\u03b5\u03c7\u03ac\u03c3\u03b5\u03b9 \u03c4\u03b7\u03bd \u03ba\u03ac\u03c1\u03c4\u03b1 \u03bc\u03bf\u03c5 \u03ba\u03b1\u03b9 \u03b8\u03b1 \u03c8\u03ae\u03c6\u03b9\u03b6\u03b1 \u03ba\u03b1\u03c4\u03ac \u03c4\u03b7\u03c2 \u03c0\u03c1\u03cc\u03c4\u03b1\u03c3\u03b7\u03c2. \u0398\u03b5\u03c9\u03c1\u03ce \u03c3\u03c5\u03bd\u03b5\u03c0\u03ce\u03c2 \u03cc\u03c4\u03b9 \u03b7 \u03c0\u03c1\u03bf\u03c6\u03bf\u03c1\u03b9\u03ba\u03ae \u03b5\u03c1\u03ce\u03c4\u03b7\u03c3\u03b7 \u03c0\u03b1\u03c1\u03b1\u03bc\u03ad\u03bd\u03b5\u03b9 \u03c3\u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7.
+(\u03a7\u03b5\u03b9\u03c1\u03bf\u03ba\u03c1\u03bf\u03c4\u03ae\u03bc\u03b1\u03c4\u03b1) \u0395\u03af\u03bd\u03b1\u03b9 \u03b7 \u03c4\u03b5\u03bb\u03b5\u03c5\u03c4\u03b1\u03af\u03b1 \u03c6\u03bf\u03c1\u03ac \u03c0\u03bf\u03c5 \u03bb\u03b1\u03bc\u03b2\u03ac\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c5\u03c0\u03cc\u03c8\u03b7 \u03cc\u03c3\u03bf\u03c5\u03c2 \u03ad\u03c7\u03bf\u03c5\u03bd \u03be\u03b5\u03c7\u03ac\u03c3\u03b5\u03b9 \u03c4\u03b9\u03c2 \u03ba\u03ac\u03c1\u03c4\u03b5\u03c2 \u03c4\u03bf\u03c5\u03c2. \u039d\u03b1 \u03b5\u03af\u03bd\u03b1\u03b9 \u03b1\u03c0\u03bf\u03bb\u03cd\u03c4\u03c9\u03c2 \u03c3\u03b1\u03c6\u03ad\u03c2 \u03ba\u03b1\u03b9 \u03bd\u03b1 \u03c4\u03bf \u03c0\u03b5\u03af\u03c4\u03b5 \u03ba\u03b1\u03b9 \u03c3\u03c4\u03bf\u03c5\u03c2 \u03ac\u03bb\u03bb\u03bf\u03c5\u03c2.
+(\u03a7\u03b5\u03b9\u03c1\u03bf\u03ba\u03c1\u03bf\u03c4\u03ae\u03bc\u03b1\u03c4\u03b1)\u039d\u03b1\u03b9, \u03b7 \u03c0\u03c1\u03bf\u03c6\u03bf\u03c1\u03b9\u03ba\u03ae \u03b5\u03c1\u03ce\u03c4\u03b7\u03c3\u03b7 \u03c0\u03b1\u03c1\u03b1\u03bc\u03ad\u03bd\u03b5\u03b9 \u03c3\u03c4\u03b7\u03bd \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7 \u03ba\u03b1\u03b9 \u03bd\u03b1\u03b9, \u03b7 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03bf\u03c2 \u03ad\u03c7\u03b5\u03b9 \u03b4\u03b9\u03ba\u03b1\u03af\u03c9\u03bc\u03b1 \u03c8\u03ae\u03c6\u03bf\u03c5, \u03cc\u03c0\u03c9\u03c2 \u03b5\u03c0\u03af\u03c3\u03b7\u03c2 \u03ad\u03c7\u03b5\u03b9 \u03b4\u03b9\u03ba\u03b1\u03af\u03c9\u03bc\u03b1 \u03bd\u03b1 \u03be\u03b5\u03c7\u03ac\u03c3\u03b5\u03b9 \u03c4\u03b7\u03bd \u03ba\u03ac\u03c1\u03c4\u03b1 \u03c4\u03b7\u03c2.
+\u03a0\u03c1\u03bf\u03c7\u03c9\u03c1\u03bf\u03cd\u03bc\u03b5 \u03c3\u03c4\u03b9\u03c2 \u03c5\u03c0\u03cc\u03bb\u03bf\u03b9\u03c0\u03b5\u03c2 \u03c4\u03c1\u03bf\u03c0\u03bf\u03c0\u03bf\u03b9\u03ae\u03c3\u03b5\u03b9\u03c2 \u03c4\u03b7\u03c2 \u03b7\u03bc\u03b5\u03c1\u03ae\u03c3\u03b9\u03b1\u03c2 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7\u03c2.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03c3\u03c4\u03b7\u03bd \u03c0\u03c1\u03bf\u03b7\u03b3\u03bf\u03cd\u03bc\u03b5\u03bd\u03b7 \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1 - \u03ba\u03b1\u03b9 \u03b8\u03b1 \u03b5\u03bc\u03bc\u03b5\u03af\u03bd\u03c9 \u03c3\u03c4\u03b7\u03bd \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03ae \u03c3\u03b1\u03c2 \u03c3\u03b5 \u03b1\u03c5\u03c4\u03cc \u03c4\u03bf \u03b8\u03ad\u03bc\u03b1 - \u03b1\u03bd\u03b1\u03c6\u03bf\u03c1\u03b9\u03ba\u03ac \u03bc\u03b5 \u03c4\u03bf \u03c3\u03c4\u03c1\u03b1\u03c4\u03b7\u03b3\u03b9\u03ba\u03cc \u03c3\u03c7\u03ad\u03b4\u03b9\u03bf \u03c4\u03b7\u03c2 \u0395\u03c0\u03b9\u03c4\u03c1\u03bf\u03c0\u03ae\u03c2, \u03b5\u03ba\u03b4\u03ae\u03bb\u03c9\u03c3\u03b1 \u03c4\u03b7\u03bd \u03c0\u03c1\u03cc\u03b8\u03b5\u03c3\u03ae \u03bc\u03bf\u03c5 \u03bd\u03b1 \u03bc\u03b9\u03bb\u03ae\u03c3\u03c9 \u03c0\u03c1\u03b9\u03bd \u03b1\u03c0\u03cc \u03c4\u03b7\u03bd \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1 \u03b5\u03be \u03bf\u03bd\u03cc\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03bc\u03bf\u03c5. \u0391\u03c5\u03c4\u03cc \u03b4\u03b5\u03bd \u03c3\u03c5\u03bd\u03ad\u03b2\u03b7. \u0398\u03b1 \u03c4\u03bf \u03b5\u03ba\u03c4\u03b9\u03bc\u03bf\u03cd\u03c3\u03b1 \u03b5\u03ac\u03bd \u03bc\u03b5 \u03c4\u03bf \u03c0\u03ad\u03c1\u03b1\u03c2 \u03b1\u03c5\u03c4\u03bf\u03cd \u03c4\u03bf\u03c5 \u03b8\u03ad\u03bc\u03b1\u03c4\u03bf\u03c2 \u03bc\u03bf\u03c5 \u03b5\u03c0\u03b9\u03c4\u03c1\u03b1\u03c0\u03b5\u03af \u03bd\u03b1 \u03c0\u03c1\u03bf\u03b2\u03ce \u03c3\u03b5 \u03b1\u03b9\u03c4\u03b9\u03bf\u03bb\u03cc\u03b3\u03b7\u03c3\u03b7 \u03c8\u03ae\u03c6\u03bf\u03c5 \u03b5\u03ba \u03bc\u03ad\u03c1\u03bf\u03c5\u03c2 \u03c4\u03b7\u03c2 \u039f\u03bc\u03ac\u03b4\u03b1\u03c2 \u03bc\u03bf\u03c5. \u03a0\u03c1\u03cc\u03ba\u03b5\u03b9\u03c4\u03b1\u03b9 \u03b3\u03b9\u03b1 \u03c3\u03b7\u03bc\u03b1\u03bd\u03c4\u03b9\u03ba\u03cc \u03b8\u03ad\u03bc\u03b1. 
\u0398\u03b1 \u03ae\u03c4\u03b1\u03bd \u03c7\u03c1\u03ae\u03c3\u03b9\u03bc\u03bf \u03bd\u03b1 \u03ba\u03b1\u03c4\u03b1\u03b3\u03c1\u03b1\u03c6\u03b5\u03af \u03c3\u03c4\u03b1 \u03c0\u03c1\u03b1\u03ba\u03c4\u03b9\u03ba\u03ac \u03c4\u03bf\u03c5 \u03a3\u03ce\u03bc\u03b1\u03c4\u03bf\u03c2 \u03c0\u03ce\u03c2 \u03b1\u03bd\u03c4\u03b9\u03bb\u03b1\u03bc\u03b2\u03ac\u03bd\u03b5\u03c4\u03b1\u03b9 \u03bf \u03ba\u03b1\u03b8\u03ad\u03bd\u03b1\u03c2 \u03b1\u03c5\u03c4\u03ac \u03c0\u03bf\u03c5 \u03bc\u03cc\u03bb\u03b9\u03c2 \u03c0\u03c1\u03ac\u03be\u03b1\u03bc\u03b5, \u03c5\u03c0\u03cc \u03c4\u03bf \u03c6\u03c9\u03c2 \u03c4\u03b7\u03c2 \u03b4\u03b9\u03ba\u03ae\u03c2 \u03c4\u03bf\u03c5 \u03c0\u03bf\u03bb\u03b9\u03c4\u03b9\u03ba\u03ae\u03c2 \u03b1\u03bd\u03ac\u03bb\u03c5\u03c3\u03b7\u03c2.
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b4\u03b5\u03bd \u03c3\u03ba\u03bf\u03c0\u03b5\u03cd\u03c9 \u03bd\u03b1 \u03b5\u03c0\u03b1\u03bd\u03b1\u03bb\u03ac\u03b2\u03c9 \u03c4\u03b7 \u03c3\u03c5\u03b6\u03ae\u03c4\u03b7\u03c3\u03b7, \u03cc\u03bc\u03c9\u03c2 \u03ba\u03b1\u03b9 \u03b5\u03b3\u03ce \u03b5\u03af\u03c7\u03b1 \u03b4\u03b7\u03bb\u03ce\u03c3\u03b5\u03b9 \u03c4\u03b7\u03bd \u03b5\u03c0\u03b9\u03b8\u03c5\u03bc\u03af\u03b1 \u03bd\u03b1 \u03c0\u03b1\u03c1\u03ad\u03bc\u03b2\u03c9 \u03b5\u03c0\u03af \u03c4\u03b7\u03c2 \u03b1\u03b9\u03c4\u03ae\u03c3\u03b5\u03c9\u03c2 \u03c4\u03bf\u03c5 \u03ba. Bar\u03c3n Crespo. \u0394\u03b5\u03bd \u03bc\u03bf\u03c5 \u03b4\u03ce\u03c3\u03b1\u03c4\u03b5 \u03cc\u03bc\u03c9\u03c2 \u03c4\u03bf\u03bd \u03bb\u03cc\u03b3\u03bf. \u039b\u03c5\u03c0\u03ac\u03bc\u03b1\u03b9 \u03b3\u03b9' \u03b1\u03c5\u03c4\u03cc, \u03cc\u03bc\u03c9\u03c2 \u03b7 \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1 \u03b4\u03b9\u03b5\u03bd\u03b5\u03c1\u03b3\u03ae\u03b8\u03b7\u03ba\u03b5, \u03b7 \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03b7 \u03bb\u03ae\u03c6\u03b8\u03b7\u03ba\u03b5, \u03b5\u03c0\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2 \u03b8\u03b5\u03c9\u03c1\u03ce \u03c4\u03bf \u03b8\u03ad\u03bc\u03b1 \u03bb\u03ae\u03be\u03b1\u03bd.
+
+\u039b\u03c5\u03c0\u03bf\u03cd\u03bc\u03b1\u03b9 \u03ba\u03cd\u03c1\u03b9\u03b5 H\u03b4nsch \u03ba\u03b1\u03b9 \u03ba\u03cd\u03c1\u03b9\u03b5 Cox, \u03b4\u03b5\u03bd \u03b5\u03af\u03b4\u03b1 \u03cc\u03c4\u03b9 \u03b6\u03b7\u03c4\u03bf\u03cd\u03c3\u03b1\u03c4\u03b5 \u03c4\u03bf\u03bd \u03bb\u03cc\u03b3\u03bf. \u03a5\u03c0\u03cc \u03c4\u03b9\u03c2 \u03c3\u03c5\u03b3\u03ba\u03b5\u03ba\u03c1\u03b9\u03bc\u03ad\u03bd\u03b5\u03c2 \u03c3\u03c5\u03bd\u03b8\u03ae\u03ba\u03b5\u03c2, \u03b8\u03b5\u03c9\u03c1\u03ce \u03cc\u03c4\u03b9 \u03bf\u03b9 \u03b8\u03ad\u03c3\u03b5\u03b9\u03c2 \u03b5\u03af\u03bd\u03b1\u03b9 \u03b1\u03c1\u03ba\u03b5\u03c4\u03ac \u03c3\u03b1\u03c6\u03b5\u03af\u03c2 \u03ba\u03b1\u03b9 \u03b8\u03b1 \u03ba\u03b1\u03c4\u03b1\u03b3\u03c1\u03b1\u03c6\u03bf\u03cd\u03bd \u03c3\u03c4\u03b1 \u03c3\u03c5\u03bd\u03bf\u03c0\u03c4\u03b9\u03ba\u03ac \u03c0\u03c1\u03b1\u03ba\u03c4\u03b9\u03ba\u03ac. \u038c\u03c4\u03b1\u03bd \u03b1\u03cd\u03c1\u03b9\u03bf \u03b5\u03b3\u03ba\u03c1\u03af\u03bd\u03bf\u03c5\u03bc\u03b5 \u03c4\u03b1 \u03c3\u03c5\u03bd\u03bf\u03c0\u03c4\u03b9\u03ba\u03ac \u03c0\u03c1\u03b1\u03ba\u03c4\u03b9\u03ba\u03ac \u03c4\u03b7\u03c2 \u03c3\u03b7\u03bc\u03b5\u03c1\u03b9\u03bd\u03ae\u03c2 \u03c3\u03c5\u03bd\u03b5\u03b4\u03c1\u03af\u03b1\u03c3\u03b7\u03c2, \u03bf\u03b9 \u03c3\u03c5\u03bd\u03ac\u03b4\u03b5\u03bb\u03c6\u03bf\u03b9 \u03c0\u03bf\u03c5 \u03b8\u03b5\u03c9\u03c1\u03bf\u03cd\u03bd \u03cc\u03c4\u03b9 \u03bf\u03b9 \u03b8\u03ad\u03c3\u03b5\u03b9\u03c2 \u03b4\u03b5\u03bd \u03ad\u03c7\u03bf\u03c5\u03bd \u03b5\u03c0\u03b5\u03be\u03b7\u03b3\u03b7\u03b8\u03b5\u03af \u03b5\u03c0\u03b1\u03c1\u03ba\u03ce\u03c2, \u03bc\u03c0\u03bf\u03c1\u03bf\u03cd\u03bd \u03bd\u03b1 \u03b6\u03b7\u03c4\u03ae\u03c3\u03bf\u03c5\u03bd \u03c4\u03c1\u03bf\u03c0\u03bf\u03c0\u03bf\u03b9\u03ae\u03c3\u03b5\u03b9\u03c2. 
\u039c\u03bf\u03c5 \u03c6\u03b1\u03af\u03bd\u03b5\u03c4\u03b1\u03b9 \u03cc\u03c4\u03b9 \u03b5\u03af\u03bd\u03b1\u03b9 \u03bc\u03b9\u03b1 \u03ba\u03b1\u03bb\u03ae \u03bb\u03cd\u03c3\u03b7. \u0395\u03be\u03c5\u03c0\u03b1\u03ba\u03bf\u03cd\u03b5\u03c4\u03b1\u03b9 \u03cc\u03c4\u03b9 \u03c3\u03c4\u03b1 \u03c3\u03c5\u03bd\u03bf\u03c0\u03c4\u03b9\u03ba\u03ac \u03c0\u03c1\u03b1\u03ba\u03c4\u03b9\u03ba\u03ac \u03c4\u03b7\u03c2 \u03b1\u03c5\u03c1\u03b9\u03b1\u03bd\u03ae\u03c2 \u03c3\u03c5\u03bd\u03b5\u03b4\u03c1\u03af\u03b1\u03c3\u03b7\u03c2 \u03b8\u03b1 \u03bb\u03b7\u03c6\u03b8\u03bf\u03cd\u03bd \u03c5\u03c0\u03cc\u03c8\u03b7 \u03cc\u03bb\u03b5\u03c2 \u03bf\u03b9 \u03c3\u03c5\u03bc\u03c0\u03bb\u03b7\u03c1\u03c9\u03bc\u03b1\u03c4\u03b9\u03ba\u03ad\u03c2 \u03b5\u03c0\u03b5\u03be\u03b7\u03b3\u03ae\u03c3\u03b5\u03b9\u03c2. \u0398\u03b5\u03c9\u03c1\u03ce \u03cc\u03c4\u03b9 \u03b5\u03af\u03bd\u03b1\u03b9 \u03ba\u03b1\u03bb\u03cd\u03c4\u03b5\u03c1\u03b7 \u03bb\u03cd\u03c3
  \u03b1\u03c0\u03cc \u03c4\u03bf \u03bd\u03b1 \u03c0\u03c1\u03bf\u03b2\u03bf\u03cd\u03bc\u03b5 \u03c4\u03ce\u03c1\u03b1 \u03c3\u03b5 \u03b1\u03b9\u03c4\u03b9\u03bf\u03bb\u03bf\u03b3\u03ae\u03c3\u03b5\u03b9\u03c2 \u03c8\u03ae\u03c6\u03bf\u03c5, \u03bf\u03b9 \u03bf\u03c0\u03bf\u03af\u03b5\u03c2 \u03b8\u03b1 \u03bc\u03b1\u03c2 \u03b1\u03c1\u03b3\u03bf\u03c0\u03bf\u03c1\u03bf\u03cd\u03c3\u03b1\u03bd. \u039a\u03cd\u03c1\u03b9\u03b5 Cox, \u03ba\u03cd\u03c1\u03b9\u03b5 H\u03b4nsch, \u03c3\u03c5\u03bc\u03c6\u03c9\u03bd\u03b5\u03af\u03c4\u03b5 \u03bc\u03b5 \u03b1\u03c5\u03c4\u03cc;
+
+\u039a\u03c5\u03c1\u03af\u03b1 \u03a0\u03c1\u03cc\u03b5\u03b4\u03c1\u03b5, \u03b5\u03ac\u03bd \u03b7 \u03c8\u03b7\u03c6\u03bf\u03c6\u03bf\u03c1\u03af\u03b1 \u03ba\u03b1\u03c4\u03b1\u03b3\u03c1\u03ac\u03c6\u03b5\u03b9 \u03bf\u03c1\u03b8\u03ce\u03c2 \u03c4\u03bf\u03bd \u03c4\u03c1\u03cc\u03c0\u03bf \u03bc\u03b5 \u03c4\u03bf\u03bd \u03bf\u03c0\u03bf\u03af\u03bf \u03c8\u03ae\u03c6\u03b9\u03c3\u03b5 \u03b7 \u039f\u03bc\u03ac\u03b4\u03b1 \u03bc\u03bf\u03c5, \u03b4\u03b5\u03bd \u03b8\u03b1 \u03b5\u03bd\u03b1\u03bd\u03c4\u03b9\u03c9\u03b8\u03ce \u03c3\u03b5 \u03b1\u03c5\u03c4\u03cc \u03ba\u03b1\u03b9 \u03bf\u03cd\u03c4\u03b5 \u03bc\u03c0\u03bf\u03c1\u03ce \u03bd\u03b1 \u03c4\u03bf \u03c0\u03c1\u03ac\u03be\u03c9. \u0395\u03ac\u03bd \u03b7 \u03b1\u03c0\u03cc\u03c6\u03b1\u03c3\u03ae \u03c3\u03b1\u03c2 \u03b5\u03af\u03bd\u03b1\u03b9 \u03cc\u03c4\u03b9 \u03b4\u03b5\u03bd \u03bc\u03c0\u03bf\u03c1\u03ce \u03bd\u03b1 \u03c0\u03c1\u03bf\u03b2\u03ce \u03c3\u03b5 \u03b1\u03b9\u03c4\u03b9\u03bf\u03bb\u03cc\u03b3\u03b7\u03c3\u03b7 \u03c8\u03ae\u03c6\u03bf\u03c5, \u03c4\u03b7 \u03b4\u03ad\u03c7\u03bf\u03bc\u03b1\u03b9 \u03b1\u03bb\u03bb\u03ac \u03bc\u03b5 \u03b5\u03c0\u03b9\u03c6\u03cd\u03bb\u03b1\u03be\u03b7.
+
+\u0398\u03b1 \u03b4\u03ce\u03c3\u03bf\u03c5\u03bc\u03b5 \u03bb\u03bf\u03b9\u03c0\u03cc\u03bd \u03bc\u03b5\u03b3\u03ac\u03bb\u03b7 \u03c0\u03c1\u03bf\u03c3\u03bf\u03c7\u03ae \u03c3\u03c4\u03b7 \u03c3\u03cd\u03bd\u03c4\u03b1\u03be\u03b7 \u03c4\u03c9\u03bd \u03c3\u03c5\u03bd\u03bf\u03c0\u03c4\u03b9\u03ba\u03ce\u03bd \u03c0\u03c1\u03b1\u03ba\u03c4\u03b9\u03ba\u03ce\u03bd. \u03a4\u03bf \u03c0\u03c1\u03ac\u03c4\u03c4\u03bf\u03c5\u03bc\u03b5 \u03ac\u03bb\u03bb\u03c9\u03c3\u03c4\u03b5 \u03c0\u03ac\u03bd\u03c4\u03bf\u03c4\u03b5. \u0395\u03ac\u03bd \u03b4\u03b5\u03bd \u03b1\u03bd\u03c4\u03b9\u03ba\u03b1\u03c4\u03bf\u03c0\u03c4\u03c1\u03af\u03b6\u03bf\u03c5\u03bd \u03b5\u03c0\u03b1\u03c1\u03ba\u03ce\u03c2 \u03c4\u03b9\u03c2 \u03b8\u03ad\u03c3\u03b5\u03b9\u03c2 \u03b8\u03b1 \u03bc\u03c0\u03bf\u03c1\u03ad\u03c3\u03bf\u03c5\u03bc\u03b5 \u03b5\u03bd\u03b4\u03b5\u03c7\u03bf\u03bc\u03ad\u03bd\u03c9\u03c2 \u03bd\u03b1 \u03c4\u03b1 \u03b4\u03b9\u03bf\u03c1\u03b8\u03ce\u03c3\u03bf\u03c5\u03bc\u03b5.
+(\u03a4\u03bf \u03a3\u03ce\u03bc\u03b1 \u03b5\u03b3\u03ba\u03c1\u03af\u03bd\u03b5\u03b9 \u03c4\u03b7 \u03b4\u03b9\u03ac\u03c4\u03b1\u03be\u03b7 \u03c4\u03c9\u03bd \u03b5\u03c1\u03b3\u03b1\u03c3\u03b9\u03ce\u03bd \u03cc\u03c0\u03c9\u03c2 \u03c4\u03c1\u03bf\u03c0\u03bf\u03c0\u03bf\u03b9\u03ae\u03b8\u03b7\u03ba\u03b5)
+


[34/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/warc/WARCExporter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/warc/WARCExporter.java b/nutch-core/src/main/java/org/apache/nutch/tools/warc/WARCExporter.java
new file mode 100644
index 0000000..2e50105
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -0,0 +1,333 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools.warc;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.net.URI;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.UUID;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.martinkl.warc.WARCRecord;
+import com.martinkl.warc.WARCWritable;
+import com.martinkl.warc.mapred.WARCOutputFormat;
+
+/**
+ * MapReduce job to exports Nutch segments as WARC files. The file format is
+ * documented in the [ISO
+ * Standard](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf).
+ * Generates elements of type response if the configuration 'store.http.headers'
+ * was set to true during the fetching and the http headers were stored
+ * verbatim; generates elements of type 'resource' otherwise.
+ **/
+
+public class WARCExporter extends Configured implements Tool {
+
+  public static Logger LOG = LoggerFactory.getLogger(WARCExporter.class);
+
+  private static final String CRLF = "\r\n";
+  private static final byte[] CRLF_BYTES = { 13, 10 };
+
+  public WARCExporter() {
+    super(null);
+  }
+
+  public WARCExporter(Configuration conf) {
+    super(conf);
+  }
+
+  public static class WARCReducer
+      implements Mapper<Text, Writable, Text, NutchWritable>,
+      Reducer<Text, NutchWritable, NullWritable, WARCWritable> {
+
+    SimpleDateFormat warcdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'",
+        Locale.ENGLISH);
+
+    @Override
+    public void configure(JobConf job) {
+    }
+
+    @Override
+    public void close() throws IOException {
+    }
+
+    public void map(Text key, Writable value,
+        OutputCollector<Text, NutchWritable> output, Reporter reporter)
+            throws IOException {
+      output.collect(key, new NutchWritable(value));
+    }
+
+    @Override
+    public void reduce(Text key, Iterator<NutchWritable> values,
+        OutputCollector<NullWritable, WARCWritable> output, Reporter reporter)
+            throws IOException {
+
+      Content content = null;
+      CrawlDatum cd = null;
+
+      // aggregate the values found
+      while (values.hasNext()) {
+        final Writable value = values.next().get(); // unwrap
+        if (value instanceof Content) {
+          content = (Content) value;
+          continue;
+        }
+        if (value instanceof CrawlDatum) {
+          cd = (CrawlDatum) value;
+          continue;
+        }
+      }
+
+      // check that we have everything we need
+      if (content == null) {
+        LOG.info("Missing content for {}", key);
+        reporter.getCounter("WARCExporter", "missing content").increment(1);
+        return;
+      }
+
+      if (cd == null) {
+        LOG.info("Missing fetch datum for {}", key);
+        reporter.getCounter("WARCExporter", "missing metadata").increment(1);
+        return;
+      }
+
+      // were the headers stored as is? Can write a response element then
+      String headersVerbatim = content.getMetadata().get("_response.headers_");
+      byte[] httpheaders = new byte[0];
+      if (StringUtils.isNotBlank(headersVerbatim)) {
+        // check that ends with an empty line
+        if (!headersVerbatim.endsWith(CRLF + CRLF)) {
+          headersVerbatim += CRLF + CRLF;
+        }
+        httpheaders = headersVerbatim.getBytes();
+      }
+
+      StringBuilder buffer = new StringBuilder();
+      buffer.append(WARCRecord.WARC_VERSION);
+      buffer.append(CRLF);
+
+      buffer.append("WARC-Record-ID").append(": ").append("<urn:uuid:")
+          .append(UUID.randomUUID().toString()).append(">").append(CRLF);
+
+      int contentLength = 0;
+      if (content != null) {
+        contentLength = content.getContent().length;
+      }
+
+      // add the length of the http header
+      contentLength += httpheaders.length;
+
+      buffer.append("Content-Length").append(": ")
+          .append(Integer.toString(contentLength)).append(CRLF);
+
+      Date fetchedDate = new Date(cd.getFetchTime());
+      buffer.append("WARC-Date").append(": ").append(warcdf.format(fetchedDate))
+          .append(CRLF);
+
+      // check if http headers have been stored verbatim
+      // if not generate a response instead
+      String WARCTypeValue = "resource";
+
+      if (StringUtils.isNotBlank(headersVerbatim)) {
+        WARCTypeValue = "response";
+      }
+
+      buffer.append("WARC-Type").append(": ").append(WARCTypeValue)
+          .append(CRLF);
+
+      // "WARC-IP-Address" if present
+      String IP = content.getMetadata().get("_ip_");
+      if (StringUtils.isNotBlank(IP)) {
+        buffer.append("WARC-IP-Address").append(": ").append("IP").append(CRLF);
+      }
+
+      // detect if truncated only for fetch success
+      String status = CrawlDatum.getStatusName(cd.getStatus());
+      if (status.equalsIgnoreCase("STATUS_FETCH_SUCCESS")
+          && ParseSegment.isTruncated(content)) {
+        buffer.append("WARC-Truncated").append(": ").append("unspecified")
+            .append(CRLF);
+      }
+
+      // must be a valid URI
+      try {
+        String normalised = key.toString().replaceAll(" ", "%20");
+        URI uri = URI.create(normalised);
+        buffer.append("WARC-Target-URI").append(": ")
+            .append(uri.toASCIIString()).append(CRLF);
+      } catch (Exception e) {
+        LOG.error("Invalid URI {} ", key);
+        reporter.getCounter("WARCExporter", "invalid URI").increment(1);
+        return;
+      }
+
+      // provide a ContentType if type response
+      if (WARCTypeValue.equals("response")) {
+        buffer.append("Content-Type: application/http; msgtype=response")
+            .append(CRLF);
+      }
+
+      // finished writing the WARC headers, now let's serialize it
+
+      ByteArrayOutputStream bos = new ByteArrayOutputStream();
+
+      // store the headers
+      bos.write(buffer.toString().getBytes("UTF-8"));
+      bos.write(CRLF_BYTES);
+      // the http headers
+      bos.write(httpheaders);
+
+      // the binary content itself
+      if (content.getContent() != null) {
+        bos.write(content.getContent());
+      }
+      bos.write(CRLF_BYTES);
+      bos.write(CRLF_BYTES);
+
+      try {
+        DataInput in = new DataInputStream(
+            new ByteArrayInputStream(bos.toByteArray()));
+        WARCRecord record = new WARCRecord(in);
+        output.collect(NullWritable.get(), new WARCWritable(record));
+        reporter.getCounter("WARCExporter", "records generated").increment(1);
+      } catch (IOException exception) {
+        LOG.error("Exception when generating WARC record for {} : {}", key,
+            exception.getMessage());
+        reporter.getCounter("WARCExporter", "exception").increment(1);
+      }
+
+    }
+  }
+
+  public int generateWARC(String output, List<Path> segments) {
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("WARCExporter: starting at {}", sdf.format(start));
+
+    final JobConf job = new NutchJob(getConf());
+    job.setJobName("warc-exporter " + output);
+
+    for (final Path segment : segments) {
+      LOG.info("warc-exporter: adding segment: {}", segment);
+      FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+      FileInputFormat.addInputPath(job,
+          new Path(segment, CrawlDatum.FETCH_DIR_NAME));
+    }
+
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(WARCReducer.class);
+    job.setReducerClass(WARCReducer.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(NutchWritable.class);
+
+    FileOutputFormat.setOutputPath(job, new Path(output));
+    // using the old api
+    job.setOutputFormat(WARCOutputFormat.class);
+
+    job.setOutputKeyClass(NullWritable.class);
+    job.setOutputValueClass(WARCWritable.class);
+
+    try {
+      RunningJob rj = JobClient.runJob(job);
+      LOG.info(rj.getCounters().toString());
+      long end = System.currentTimeMillis();
+      LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end),
+          TimingUtil.elapsedTime(start, end));
+    } catch (Exception e) {
+      LOG.error("Exception caught", e);
+      return -1;
+    }
+
+    return 0;
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err.println(
+          "Usage: WARCExporter <output> (<segment> ... | -dir <segments>)");
+      return -1;
+    }
+
+    final List<Path> segments = new ArrayList<Path>();
+
+    for (int i = 1; i < args.length; i++) {
+      if (args[i].equals("-dir")) {
+        Path dir = new Path(args[++i]);
+        FileSystem fs = dir.getFileSystem(getConf());
+        FileStatus[] fstats = fs.listStatus(dir,
+            HadoopFSUtil.getPassDirectoriesFilter(fs));
+        Path[] files = HadoopFSUtil.getPaths(fstats);
+        for (Path p : files) {
+          segments.add(p);
+        }
+      } else {
+        segments.add(new Path(args[i]));
+      }
+    }
+
+    return generateWARC(args[0], segments);
+  }
+
+  public static void main(String[] args) throws Exception {
+    final int res = ToolRunner.run(NutchConfiguration.create(),
+        new WARCExporter(), args);
+    System.exit(res);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/tools/warc/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/tools/warc/package-info.java b/nutch-core/src/main/java/org/apache/nutch/tools/warc/package-info.java
new file mode 100644
index 0000000..44e1a94
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/tools/warc/package-info.java
@@ -0,0 +1,23 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+/**
+ * Tools to import / export between Nutch segments and
+ * <a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf">
+ * WARC archives</a>.
+ */
+package org.apache.nutch.tools.warc;

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/CommandRunner.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/CommandRunner.java b/nutch-core/src/main/java/org/apache/nutch/util/CommandRunner.java
new file mode 100644
index 0000000..593d590
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/CommandRunner.java
@@ -0,0 +1,291 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Adopted by John Xing for Nutch Project from
+ * http://blog.fivesight.com/prb/space/Call+an+External+Command+from+Java/,
+ * which explains the code in detail.
+ * [Original author is moving his site to http://mult.ifario.us/   -peb]
+ *
+ * Comments by John Xing on 20040621:
+ * (1) EDU.oswego.cs.dl.util.concurrent.* is in j2sdk 1.5 now.
+ *     Modifications are needed if we move to j2sdk 1.5.
+ * (2) The original looks good, not much to change.
+ *
+ * This code is in the public domain and comes with no warranty.  
+ */
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.InterruptedIOException;
+import java.util.concurrent.BrokenBarrierException;
+import java.util.concurrent.CyclicBarrier;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
/**
 * Runs an external command with an optional timeout, pumping the child
 * process's stdin/stdout/stderr through caller-supplied streams on daemon
 * threads. Not thread-safe: configure, then call {@link #evaluate()} or
 * {@link #exec()} from a single thread.
 */
public class CommandRunner {

  // wait for the child to exit (and destroy it) vs. fire-and-forget
  private boolean _waitForExit = true;
  // command line passed verbatim to Runtime.exec()
  private String _command;
  // timeout in seconds; 0 means wait indefinitely
  private int _timeout = 10;

  // data to feed to the child's stdin (may be null)
  private InputStream _stdin;
  // sinks for the child's stdout / stderr
  private OutputStream _stdout;
  private OutputStream _stderr;

  // copy-buffer size for the pumper threads
  private static final int BUF = 4096;

  // exit value of the child; -1 if it timed out or never finished
  private int _xit;

  // last error raised inside a pumper thread, if any
  private Throwable _thrownError;

  // rendezvous point: main thread + stdout/stderr pullers (+ stdin pusher)
  private CyclicBarrier _barrier;

  public int getExitValue() {
    return _xit;
  }

  public void setCommand(String s) {
    _command = s;
  }

  public String getCommand() {
    return _command;
  }

  public void setInputStream(InputStream is) {
    _stdin = is;
  }

  public void setStdOutputStream(OutputStream os) {
    _stdout = os;
  }

  public void setStdErrorStream(OutputStream os) {
    _stderr = os;
  }

  /** Convenience alias for {@link #exec()}, discarding the return value. */
  public void evaluate() throws IOException {
    this.exec();
  }

  /**
   * 
   * @return process exit value (return code) or -1 if timed out.
   * @throws IOException
   */
  public int exec() throws IOException {
    Process proc = Runtime.getRuntime().exec(_command);
    // parties: this thread + STDOUT puller + STDERR puller (+ STDIN pusher)
    _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 1 : 0));

    PullerThread so = new PullerThread("STDOUT", proc.getInputStream(), _stdout);
    so.setDaemon(true);
    so.start();

    PullerThread se = new PullerThread("STDERR", proc.getErrorStream(), _stderr);
    se.setDaemon(true);
    se.start();

    PusherThread si = null;
    if (_stdin != null) {
      si = new PusherThread("STDIN", _stdin, proc.getOutputStream());
      si.setDaemon(true);
      si.start();
    }

    boolean _timedout = false;
    long end = System.currentTimeMillis() + _timeout * 1000;

    // wait until every pumper thread has drained its stream (or time out)
    try {
      if (_timeout == 0) {
        _barrier.await();
      } else {
        _barrier.await(_timeout, TimeUnit.SECONDS);
      }
    } catch (TimeoutException ex) {
      _timedout = true;
    } catch (BrokenBarrierException bbe) {
      /* IGNORE */
    } catch (InterruptedException e) {
      /* IGNORE */
    }

    // tell the io threads we are finished
    if (si != null) {
      si.interrupt();
    }
    so.interrupt();
    se.interrupt();

    _xit = -1;

    if (!_timedout) {
      if (_waitForExit) {
        // poll exitValue() once a second until the child terminates,
        // this thread is interrupted, or the deadline passes
        do {
          try {
            Thread.sleep(1000);
            _xit = proc.exitValue();
          } catch (InterruptedException ie) {
            if (Thread.interrupted()) {
              break; // stop waiting on an interrupt for this thread
            } else {
              continue;
            }
          } catch (IllegalThreadStateException iltse) {
            // child still running; keep polling
            continue;
          }
          break;
        } while (!(_timedout = (System.currentTimeMillis() > end)));
      } else {
        try {
          _xit = proc.exitValue();
        } catch (IllegalThreadStateException iltse) {
          _timedout = true;
        }
      }
    }

    if (_waitForExit) {
      proc.destroy();
    }
    return _xit;
  }

  public Throwable getThrownError() {
    return _thrownError;
  }

  /**
   * Copies bytes from an InputStream to an OutputStream until EOF or
   * interruption, then closes one end and waits on the shared barrier.
   */
  private class PumperThread extends Thread {

    private OutputStream _os;
    private InputStream _is;

    // which end to close when done: the source (pullers) or the sink (pusher)
    private boolean _closeInput;

    protected PumperThread(String name, InputStream is, OutputStream os,
        boolean closeInput) {
      super(name);
      _is = is;
      _os = os;
      _closeInput = closeInput;
    }

    public void run() {
      try {
        byte[] buf = new byte[BUF];
        int read = 0;
        while (!isInterrupted() && (read = _is.read(buf)) != -1) {
          if (read == 0)
            continue;
          _os.write(buf, 0, read);
          _os.flush();
        }
      } catch (InterruptedIOException iioe) {
        // ignored: interruption is the normal shutdown path
      } catch (Throwable t) {
        // surface the failure to the caller via getThrownError()
        _thrownError = t;
      } finally {
        try {
          if (_closeInput) {
            _is.close();
          } else {
            _os.close();
          }
        } catch (IOException ioe) {
          /* IGNORE */
        }
      }
      // signal the coordinating thread that this pumper is done
      try {
        _barrier.await();
      } catch (InterruptedException ie) {
        /* IGNORE */
      } catch (BrokenBarrierException bbe) {
        /* IGNORE */
      }
    }
  }

  /** Feeds the caller-supplied stdin into the child; closes the child's stdin. */
  private class PusherThread extends PumperThread {
    PusherThread(String name, InputStream is, OutputStream os) {
      super(name, is, os, false);
    }
  }

  /** Drains the child's stdout/stderr into a caller sink; closes the source. */
  private class PullerThread extends PumperThread {
    PullerThread(String name, InputStream is, OutputStream os) {
      super(name, is, os, true);
    }
  }

  public int getTimeout() {
    return _timeout;
  }

  public void setTimeout(int timeout) {
    _timeout = timeout;
  }

  public boolean getWaitForExit() {
    return _waitForExit;
  }

  public void setWaitForExit(boolean waitForExit) {
    _waitForExit = waitForExit;
  }

  /**
   * Command-line entry point: runs {@code commandPath} feeding it the
   * contents of {@code filePath} as stdin.
   */
  public static void main(String[] args) throws Exception {
    String commandPath = null;
    String filePath = null;
    int timeout = 10;

    String usage = "Usage: CommandRunner [-timeout timeoutSecs] commandPath filePath";

    if (args.length < 2) {
      System.err.println(usage);
      System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-timeout")) {
        timeout = Integer.parseInt(args[++i]);
        ;
      } else if (i != args.length - 2) {
        System.err.println(usage);
        System.exit(-1);
      } else {
        commandPath = args[i];
        filePath = args[++i];
      }
    }

    CommandRunner cr = new CommandRunner();

    cr.setCommand(commandPath);
    cr.setInputStream(new java.io.FileInputStream(filePath));
    cr.setStdErrorStream(System.err);
    cr.setStdOutputStream(System.out);

    cr.setTimeout(timeout);

    cr.evaluate();

    System.err.println("output value: " + cr.getExitValue());
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/CrawlCompletionStats.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/CrawlCompletionStats.java b/nutch-core/src/main/java/org/apache/nutch/util/CrawlCompletionStats.java
new file mode 100644
index 0000000..8aafe59
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -0,0 +1,245 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.MissingOptionException;
+
+/**
+ * Extracts some simple crawl completion stats from the crawldb
+ *
+ * Stats will be sorted by host/domain and will be of the form:
+ * 1	www.spitzer.caltech.edu FETCHED
+ * 50	www.spitzer.caltech.edu UNFETCHED
+ *
+ */
+public class CrawlCompletionStats extends Configured implements Tool {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlCompletionStats.class);
+
+  private static final int MODE_HOST = 1;
+  private static final int MODE_DOMAIN = 2;
+
+  private int mode = 0;
+
+  public int run(String[] args) throws Exception {
+    Option helpOpt = new Option("h", "help", false, "Show this message");
+    Option inDirs = OptionBuilder
+        .withArgName("inputDirs")
+        .isRequired()
+        .withDescription("Comma separated list of crawl directories (e.g., \"./crawl1,./crawl2\")")
+        .hasArgs()
+        .create("inputDirs");
+    Option outDir = OptionBuilder
+        .withArgName("outputDir")
+        .isRequired()
+        .withDescription("Output directory where results should be dumped")
+        .hasArgs()
+        .create("outputDir");
+    Option modeOpt = OptionBuilder
+        .withArgName("mode")
+        .isRequired()
+        .withDescription("Set statistics gathering mode (by 'host' or by 'domain')")
+        .hasArgs()
+        .create("mode");
+    Option numReducers = OptionBuilder
+        .withArgName("numReducers")
+        .withDescription("Optional number of reduce jobs to use. Defaults to 1")
+        .hasArgs()
+        .create("numReducers");
+
+    Options options = new Options();
+    options.addOption(helpOpt);
+    options.addOption(inDirs);
+    options.addOption(outDir);
+    options.addOption(modeOpt);
+    options.addOption(numReducers);
+
+    CommandLineParser parser = new GnuParser();
+    CommandLine cli;
+
+    try {
+      cli = parser.parse(options, args);
+    } catch (MissingOptionException e) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("CrawlCompletionStats", options, true);
+      return 1;
+    }
+
+    if (cli.hasOption("help")) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("CrawlCompletionStats", options, true);
+      return 1;
+    }
+
+    String inputDir = cli.getOptionValue("inputDirs");
+    String outputDir = cli.getOptionValue("outputDir");
+
+    int numOfReducers = 1;
+    if (cli.hasOption("numReducers")) {
+      numOfReducers = Integer.parseInt(args[3]);
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start));
+
+    int mode = 0;
+    String jobName = "CrawlCompletionStats";
+    if (cli.getOptionValue("mode").equals("host")) {
+      jobName = "Host CrawlCompletionStats";
+      mode = MODE_HOST;
+    } else if (cli.getOptionValue("mode").equals("domain")) {
+      jobName = "Domain CrawlCompletionStats";
+      mode = MODE_DOMAIN;
+    } 
+
+    Configuration conf = getConf();
+    conf.setInt("domain.statistics.mode", mode);
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    Job job = Job.getInstance(conf, jobName);
+    job.setJarByClass(CrawlCompletionStats.class);
+
+    String[] inputDirsSpecs = inputDir.split(",");
+    for (int i = 0; i < inputDirsSpecs.length; i++) {
+      File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current");
+      FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+      
+    }
+
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    FileOutputFormat.setOutputPath(job, new Path(outputDir));
+    job.setOutputFormatClass(TextOutputFormat.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(LongWritable.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(LongWritable.class);
+
+    job.setMapperClass(CrawlCompletionStatsMapper.class);
+    job.setReducerClass(CrawlCompletionStatsReducer.class);
+    job.setCombinerClass(CrawlCompletionStatsCombiner.class);
+    job.setNumReduceTasks(numOfReducers);
+
+    try {
+      job.waitForCompletion(true);
+    } catch (Exception e) {
+      throw e;
+    }
+
+    long end = System.currentTimeMillis();
+    LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}",
+      sdf.format(end), TimingUtil.elapsedTime(start, end));
+    return 0;
+  }
+
+  static class CrawlCompletionStatsMapper extends
+      Mapper<Text, CrawlDatum, Text, LongWritable> {
+    int mode = 0;
+
+    public void setup(Context context) {
+      mode = context.getConfiguration().getInt("domain.statistics.mode", MODE_DOMAIN);
+    }
+
+    public void map(Text urlText, CrawlDatum datum, Context context)
+        throws IOException, InterruptedException {
+
+      URL url = new URL(urlText.toString());
+      String out = "";
+      switch (mode) {
+        case MODE_HOST:
+          out = url.getHost();
+          break;
+        case MODE_DOMAIN:
+          out = URLUtil.getDomainName(url);
+          break;
+      }
+
+      if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
+          || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+        context.write(new Text(out + " FETCHED"), new LongWritable(1));
+      } else {
+        context.write(new Text(out + " UNFETCHED"), new LongWritable(1));
+      }
+    }
+  }
+
+  static class CrawlCompletionStatsReducer extends
+      Reducer<Text, LongWritable, LongWritable, Text> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+
+      context.write(new LongWritable(total), key);
+    }
+  }
+
+  public static class CrawlCompletionStatsCombiner extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+      context.write(key, new LongWritable(total));
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(NutchConfiguration.create(), new CrawlCompletionStats(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/DeflateUtils.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/DeflateUtils.java b/nutch-core/src/main/java/org/apache/nutch/util/DeflateUtils.java
new file mode 100644
index 0000000..5863522
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/DeflateUtils.java
@@ -0,0 +1,140 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+import java.util.zip.DeflaterOutputStream;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A collection of utility methods for working on deflated data.
+ */
+public class DeflateUtils {
+
+  private static final Logger LOG = LoggerFactory.getLogger(DeflateUtils.class);
+  private static final int EXPECTED_COMPRESSION_RATIO = 5;
+  private static final int BUF_SIZE = 4096;
+
+  /**
+   * Returns an inflated copy of the input array. If the deflated input has been
+   * truncated or corrupted, a best-effort attempt is made to inflate as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
+   */
+  public static final byte[] inflateBestEffort(byte[] in) {
+    return inflateBestEffort(in, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Returns an inflated copy of the input array, truncated to
+   * <code>sizeLimit</code> bytes, if necessary. If the deflated input has been
+   * truncated or corrupted, a best-effort attempt is made to inflate as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
+   */
+  public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) {
+    // decompress using InflaterInputStream
+    ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+        EXPECTED_COMPRESSION_RATIO * in.length);
+
+    // "true" because HTTP does not provide zlib headers
+    Inflater inflater = new Inflater(true);
+    InflaterInputStream inStream = new InflaterInputStream(
+        new ByteArrayInputStream(in), inflater);
+
+    byte[] buf = new byte[BUF_SIZE];
+    int written = 0;
+    while (true) {
+      try {
+        int size = inStream.read(buf);
+        if (size <= 0)
+          break;
+        if ((written + size) > sizeLimit) {
+          outStream.write(buf, 0, sizeLimit - written);
+          break;
+        }
+        outStream.write(buf, 0, size);
+        written += size;
+      } catch (Exception e) {
+        LOG.info("Caught Exception in inflateBestEffort", e);
+        break;
+      }
+    }
+    try {
+      outStream.close();
+    } catch (IOException e) {
+    }
+
+    return outStream.toByteArray();
+  }
+
+  /**
+   * Returns an inflated copy of the input array.
+   * 
+   * @throws IOException
+   *           if the input cannot be properly decompressed
+   */
+  public static final byte[] inflate(byte[] in) throws IOException {
+    // decompress using InflaterInputStream
+    ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+        EXPECTED_COMPRESSION_RATIO * in.length);
+
+    InflaterInputStream inStream = new InflaterInputStream(
+        new ByteArrayInputStream(in));
+
+    byte[] buf = new byte[BUF_SIZE];
+    while (true) {
+      int size = inStream.read(buf);
+      if (size <= 0)
+        break;
+      outStream.write(buf, 0, size);
+    }
+    outStream.close();
+
+    return outStream.toByteArray();
+  }
+
+  /**
+   * Returns a deflated copy of the input array.
+   */
+  public static final byte[] deflate(byte[] in) {
+    // compress using DeflaterOutputStream
+    ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length
+        / EXPECTED_COMPRESSION_RATIO);
+
+    DeflaterOutputStream outStream = new DeflaterOutputStream(byteOut);
+
+    try {
+      outStream.write(in);
+    } catch (Exception e) {
+      LOG.error("Error compressing: ", e);
+    }
+
+    try {
+      outStream.close();
+    } catch (IOException e) {
+      LOG.error("Error closing: ", e);
+    }
+
+    return byteOut.toByteArray();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/DomUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/DomUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/DomUtil.java
new file mode 100644
index 0000000..9595bf4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/DomUtil.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.xerces.parsers.DOMParser;
+import org.w3c.dom.Element;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class DomUtil {
+
+  private final static Logger LOG = LoggerFactory.getLogger(DomUtil.class);
+
+  /**
+   * Returns parsed dom tree or null if any error
+   * 
+   * @param is
+   * @return A parsed DOM tree from the given {@link InputStream}.
+   */
+  public static Element getDom(InputStream is) {
+
+    Element element = null;
+
+    DOMParser parser = new DOMParser();
+
+    InputSource input;
+    try {
+      input = new InputSource(is);
+      input.setEncoding("UTF-8");
+      parser.parse(input);
+      int i = 0;
+      while (!(parser.getDocument().getChildNodes().item(i) instanceof Element)) {
+        i++;
+      }
+      element = (Element) parser.getDocument().getChildNodes().item(i);
+    } catch (FileNotFoundException e) {
+      LOG.error("Error: ", e);
+    } catch (SAXException e) {
+      LOG.error("Error: ", e);
+    } catch (IOException e) {
+      LOG.error("Error: ", e);
+    }
+    return element;
+  }
+
+  /**
+   * save dom into ouputstream
+   * 
+   * @param os
+   * @param e
+   */
+  public static void saveDom(OutputStream os, Element e) {
+
+    DOMSource source = new DOMSource(e);
+    TransformerFactory transFactory = TransformerFactory.newInstance();
+    Transformer transformer;
+    try {
+      transformer = transFactory.newTransformer();
+      transformer.setOutputProperty("indent", "yes");
+      StreamResult result = new StreamResult(os);
+      transformer.transform(source, result);
+      os.flush();
+    } catch (UnsupportedEncodingException e1) {
+      LOG.error("Error: ", e1);
+    } catch (IOException e1) {
+      LOG.error("Error: ", e1);
+    } catch (TransformerConfigurationException e2) {
+      LOG.error("Error: ", e2);
+    } catch (TransformerException ex) {
+      LOG.error("Error: ", ex);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/DumpFileUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/DumpFileUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/DumpFileUtil.java
new file mode 100644
index 0000000..9ed3e75
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/DumpFileUtil.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.io.MD5Hash;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Map;
+
+public class DumpFileUtil {
+    private static final Logger LOG = LoggerFactory.getLogger(DumpFileUtil.class
+            .getName());
+
+    private final static String DIR_PATTERN = "%s/%s/%s";
+    private final static String FILENAME_PATTERN = "%s_%s.%s";
+    private final static Integer MAX_LENGTH_OF_FILENAME = 32;
+    private final static Integer MAX_LENGTH_OF_EXTENSION = 5; 
+   
+    public static String getUrlMD5(String url) {
+        byte[] digest = MD5Hash.digest(url).getDigest();
+
+        StringBuffer sb = new StringBuffer();
+        for (byte b : digest) {
+            sb.append(String.format("%02x", b & 0xff));
+        }
+
+        return sb.toString();
+    }
+
+    public static String createTwoLevelsDirectory(String basePath, String md5, boolean makeDir) {
+        String firstLevelDirName = new StringBuilder().append(md5.charAt(0)).append(md5.charAt(8)).toString();
+        String secondLevelDirName = new StringBuilder().append(md5.charAt(16)).append(md5.charAt(24)).toString();
+
+        String fullDirPath = String.format(DIR_PATTERN, basePath, firstLevelDirName, secondLevelDirName);
+
+        if (makeDir) {
+	        try {
+	            FileUtils.forceMkdir(new File(fullDirPath));
+	        } catch (IOException e) {
+	            LOG.error("Failed to create dir: {}", fullDirPath);
+	            fullDirPath = null;
+	        }
+        }
+
+        return fullDirPath;
+    }
+    
+    public static String createTwoLevelsDirectory(String basePath, String md5) {
+        return createTwoLevelsDirectory(basePath, md5, true);
+    }
+
+    public static String createFileName(String md5, String fileBaseName, String fileExtension) {
+        if (fileBaseName.length() > MAX_LENGTH_OF_FILENAME) {
+            LOG.info("File name is too long. Truncated to {} characters.", MAX_LENGTH_OF_FILENAME);
+            fileBaseName = StringUtils.substring(fileBaseName, 0, MAX_LENGTH_OF_FILENAME);
+        } 
+        
+        if (fileExtension.length() > MAX_LENGTH_OF_EXTENSION) {
+            LOG.info("File extension is too long. Truncated to {} characters.", MAX_LENGTH_OF_EXTENSION);
+            fileExtension = StringUtils.substring(fileExtension, 0, MAX_LENGTH_OF_EXTENSION);
+        }
+	
+	// Added to prevent FileNotFoundException (Invalid Argument) - in *nix environment
+        fileBaseName = fileBaseName.replaceAll("\\?", "");
+        fileExtension = fileExtension.replaceAll("\\?", "");
+
+        return String.format(FILENAME_PATTERN, md5, fileBaseName, fileExtension);
+    }
+    
+    public static String createFileNameFromUrl(String basePath, String reverseKey, String urlString, String epochScrapeTime, String fileExtension, boolean makeDir) {
+		String fullDirPath = basePath + File.separator + reverseKey + File.separator + DigestUtils.sha1Hex(urlString);
+		
+		if (makeDir) {
+	        try {
+	            FileUtils.forceMkdir(new File(fullDirPath));
+	        } catch (IOException e) {
+	            LOG.error("Failed to create dir: {}", fullDirPath);
+	            fullDirPath = null;
+	        }
+        }
+		
+		if (fileExtension.length() > MAX_LENGTH_OF_EXTENSION) {
+			LOG.info("File extension is too long. Truncated to {} characters.", MAX_LENGTH_OF_EXTENSION);
+			fileExtension = StringUtils.substring(fileExtension, 0, MAX_LENGTH_OF_EXTENSION);
+	    }
+		
+		String outputFullPath = fullDirPath + File.separator + epochScrapeTime + "." + fileExtension;
+		
+		return outputFullPath;
+    }
+    
+	public static String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
+		StringBuilder builder = new StringBuilder();
+		// print total stats
+		builder.append("\nTOTAL Stats:\n");
+		builder.append("[\n");
+		int mimetypeCount = 0;
+		for (String mimeType : typeCounts.keySet()) {
+			builder.append("    {\"mimeType\":\"");
+			builder.append(mimeType);
+			builder.append("\",\"count\":\"");
+			builder.append(typeCounts.get(mimeType));
+			builder.append("\"}\n");
+			mimetypeCount += typeCounts.get(mimeType);
+		}
+		builder.append("]\n");
+		builder.append("Total count: " + mimetypeCount + "\n");
+		// filtered types stats
+		mimetypeCount = 0;
+		if (!filteredCounts.isEmpty()) {
+			builder.append("\nFILTERED Stats:\n");
+			builder.append("[\n");
+			for (String mimeType : filteredCounts.keySet()) {
+				builder.append("    {\"mimeType\":\"");
+				builder.append(mimeType);
+				builder.append("\",\"count\":\"");
+				builder.append(filteredCounts.get(mimeType));
+				builder.append("\"}\n");
+				mimetypeCount += filteredCounts.get(mimeType);
+			}
+			builder.append("]\n");
+			builder.append("Total filtered count: " + mimetypeCount + "\n");
+		}
+		return builder.toString();
+	}  
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/EncodingDetector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/EncodingDetector.java b/nutch-core/src/main/java/org/apache/nutch/util/EncodingDetector.java
new file mode 100644
index 0000000..4e62dd3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/EncodingDetector.java
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
+/**
+ * A simple class for detecting character encodings.
+ * 
+ * <p>
+ * Broadly this encompasses two functions, which are distinctly separate:
+ * 
+ * <ol>
+ * <li>Auto detecting a set of "clues" from input text.</li>
+ * <li>Taking a set of clues and making a "best guess" as to the "real"
+ * encoding.</li>
+ * </ol>
+ * </p>
+ * 
+ * <p>
+ * A caller will often have some extra information about what the encoding might
+ * be (e.g. from the HTTP header or HTML meta-tags, often wrong but still
+ * potentially useful clues). The types of clues may differ from caller to
+ * caller. Thus a typical calling sequence is:
+ * <ul>
+ * <li>Run step (1) to generate a set of auto-detected clues;</li>
+ * <li>Combine these clues with the caller-dependent "extra clues" available;</li>
+ * <li>Run step (2) to guess what the most probable answer is.</li>
+ * </ul>
+ * </p>
+ */
+public class EncodingDetector {
+
+  private class EncodingClue {
+    private String value;
+    private String source;
+    private int confidence;
+
+    // Constructor for clues with no confidence values (ignore thresholds)
+    public EncodingClue(String value, String source) {
+      this(value, source, NO_THRESHOLD);
+    }
+
+    public EncodingClue(String value, String source, int confidence) {
+      this.value = value.toLowerCase();
+      this.source = source;
+      this.confidence = confidence;
+    }
+
+    public String getSource() {
+      return source;
+    }
+
+    public String getValue() {
+      return value;
+    }
+
+    public String toString() {
+      return value + " (" + source
+          + ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
+    }
+
+    public boolean isEmpty() {
+      return (value == null || "".equals(value));
+    }
+
+    public boolean meetsThreshold() {
+      return (confidence < 0 || (minConfidence >= 0 && confidence >= minConfidence));
+    }
+  }
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(EncodingDetector.class);
+
+  public static final int NO_THRESHOLD = -1;
+
+  public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";
+
+  private static final HashMap<String, String> ALIASES = new HashMap<String, String>();
+
+  private static final HashSet<String> DETECTABLES = new HashSet<String>();
+
+  // CharsetDetector will die without a minimum amount of data.
+  private static final int MIN_LENGTH = 4;
+
+  static {
+    DETECTABLES.add("text/html");
+    DETECTABLES.add("text/plain");
+    DETECTABLES.add("text/richtext");
+    DETECTABLES.add("text/rtf");
+    DETECTABLES.add("text/sgml");
+    DETECTABLES.add("text/tab-separated-values");
+    DETECTABLES.add("text/xml");
+    DETECTABLES.add("application/rss+xml");
+    DETECTABLES.add("application/xhtml+xml");
+    /*
+     * the following map is not an alias mapping table, but maps character
+     * encodings which are often used in mislabelled documents to their correct
+     * encodings. For instance, there are a lot of documents labelled
+     * 'ISO-8859-1' which contain characters not covered by ISO-8859-1 but
+     * covered by windows-1252. Because windows-1252 is a superset of ISO-8859-1
+     * (sharing code points for the common part), it's better to treat
+     * ISO-8859-1 as synonymous with windows-1252 than to reject, as invalid,
+     * documents labelled as ISO-8859-1 that have characters outside ISO-8859-1.
+     */
+    ALIASES.put("ISO-8859-1", "windows-1252");
+    ALIASES.put("EUC-KR", "x-windows-949");
+    ALIASES.put("x-EUC-CN", "GB18030");
+    ALIASES.put("GBK", "GB18030");
+    // ALIASES.put("Big5", "Big5HKSCS");
+    // ALIASES.put("TIS620", "Cp874");
+    // ALIASES.put("ISO-8859-11", "Cp874");
+
+  }
+
+  private int minConfidence;
+
+  private CharsetDetector detector;
+
+  private List<EncodingClue> clues;
+
+  public EncodingDetector(Configuration conf) {
+    minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1);
+    detector = new CharsetDetector();
+    clues = new ArrayList<EncodingClue>();
+  }
+
+  public void autoDetectClues(Content content, boolean filter) {
+    byte[] data = content.getContent();
+
+    if (minConfidence >= 0 && DETECTABLES.contains(content.getContentType())
+        && data.length > MIN_LENGTH) {
+      CharsetMatch[] matches = null;
+
+      // do all these in a try/catch; setText and detect/detectAll
+      // will sometimes throw exceptions
+      try {
+        detector.enableInputFilter(filter);
+        if (data.length > MIN_LENGTH) {
+          detector.setText(data);
+          matches = detector.detectAll();
+        }
+      } catch (Exception e) {
+        LOG.debug("Exception from ICU4J (ignoring): ", e);
+      }
+
+      if (matches != null) {
+        for (CharsetMatch match : matches) {
+          addClue(match.getName(), "detect", match.getConfidence());
+        }
+      }
+    }
+
+    // add character encoding coming from HTTP response header
+    addClue(
+        parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)),
+        "header");
+  }
+
+  public void addClue(String value, String source, int confidence) {
+    if (value == null || "".equals(value)) {
+      return;
+    }
+    value = resolveEncodingAlias(value);
+    if (value != null) {
+      clues.add(new EncodingClue(value, source, confidence));
+    }
+  }
+
+  public void addClue(String value, String source) {
+    addClue(value, source, NO_THRESHOLD);
+  }
+
+  /**
+   * Guess the encoding with the previously specified list of clues.
+   * 
+   * @param content
+   *          Content instance
+   * @param defaultValue
+   *          Default encoding to return if no encoding can be detected with
+   *          enough confidence. Note that this will <b>not</b> be normalized
+   *          with {@link EncodingDetector#resolveEncodingAlias}
+   * 
+   * @return Guessed encoding or defaultValue
+   */
+  public String guessEncoding(Content content, String defaultValue) {
+    /*
+     * This algorithm could be replaced by something more sophisticated; ideally
+     * we would gather a bunch of data on where various clues (autodetect, HTTP
+     * headers, HTML meta tags, etc.) disagree, tag each with the correct
+     * answer, and use machine learning/some statistical method to generate a
+     * better heuristic.
+     */
+
+    String base = content.getBaseUrl();
+
+    if (LOG.isTraceEnabled()) {
+      findDisagreements(base, clues);
+    }
+
+    /*
+     * Go down the list of encoding "clues". Use a clue if: 1. Has a confidence
+     * value which meets our confidence threshold, OR 2. Doesn't meet the
+     * threshold, but is the best try, since nothing else is available.
+     */
+    EncodingClue defaultClue = new EncodingClue(defaultValue, "default");
+    EncodingClue bestClue = defaultClue;
+
+    for (EncodingClue clue : clues) {
+      if (LOG.isTraceEnabled()) {
+        LOG.trace(base + ": charset " + clue);
+      }
+      String charset = clue.value;
+      if (minConfidence >= 0 && clue.confidence >= minConfidence) {
+        if (LOG.isTraceEnabled()) {
+          LOG.trace(base + ": Choosing encoding: " + charset
+              + " with confidence " + clue.confidence);
+        }
+        return resolveEncodingAlias(charset).toLowerCase();
+      } else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
+        bestClue = clue;
+      }
+    }
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace(base + ": Choosing encoding: " + bestClue);
+    }
+    return bestClue.value.toLowerCase();
+  }
+
+  /** Clears all clues. */
+  public void clearClues() {
+    clues.clear();
+  }
+
+  /*
+   * Strictly for analysis, look for "disagreements." The top guess from each
+   * source is examined; if these meet the threshold and disagree, then we log
+   * the information -- useful for testing or generating training data for a
+   * better heuristic.
+   */
+  private void findDisagreements(String url, List<EncodingClue> newClues) {
+    HashSet<String> valsSeen = new HashSet<String>();
+    HashSet<String> sourcesSeen = new HashSet<String>();
+    boolean disagreement = false;
+    for (int i = 0; i < newClues.size(); i++) {
+      EncodingClue clue = newClues.get(i);
+      if (!clue.isEmpty() && !sourcesSeen.contains(clue.source)) {
+        if (valsSeen.size() > 0 && !valsSeen.contains(clue.value)
+            && clue.meetsThreshold()) {
+          disagreement = true;
+        }
+        if (clue.meetsThreshold()) {
+          valsSeen.add(clue.value);
+        }
+        sourcesSeen.add(clue.source);
+      }
+    }
+    if (disagreement) {
+      // dump all values in case of disagreement
+      StringBuffer sb = new StringBuffer();
+      sb.append("Disagreement: " + url + "; ");
+      for (int i = 0; i < newClues.size(); i++) {
+        if (i > 0) {
+          sb.append(", ");
+        }
+        sb.append(newClues.get(i));
+      }
+      LOG.trace(sb.toString());
+    }
+  }
+
+  public static String resolveEncodingAlias(String encoding) {
+    try {
+      if (encoding == null || !Charset.isSupported(encoding))
+        return null;
+      String canonicalName = new String(Charset.forName(encoding).name());
+      return ALIASES.containsKey(canonicalName) ? ALIASES.get(canonicalName)
+          : canonicalName;
+    } catch (Exception e) {
+      LOG.warn("Invalid encoding " + encoding + " detected, using default.");
+      return null;
+    }
+  }
+
+  /**
+   * Parse the character encoding from the specified content type header. If the
+   * content type is null, or there is no explicit character encoding,
+   * <code>null</code> is returned. <br />
+   * This method was copied from org.apache.catalina.util.RequestUtil, which is
+   * licensed under the Apache License, Version 2.0 (the "License").
+   * 
+   * @param contentType
+   *          a content type header
+   */
+  public static String parseCharacterEncoding(String contentType) {
+    if (contentType == null)
+      return (null);
+    int start = contentType.indexOf("charset=");
+    if (start < 0)
+      return (null);
+    String encoding = contentType.substring(start + 8);
+    int end = encoding.indexOf(';');
+    if (end >= 0)
+      encoding = encoding.substring(0, end);
+    encoding = encoding.trim();
+    if ((encoding.length() > 2) && (encoding.startsWith("\""))
+        && (encoding.endsWith("\"")))
+      encoding = encoding.substring(1, encoding.length() - 1);
+    return (encoding.trim());
+
+  }
+
+  public static void main(String[] args) throws IOException {
+    if (args.length != 1) {
+      System.err.println("Usage: EncodingDetector <file>");
+      System.exit(1);
+    }
+
+    Configuration conf = NutchConfiguration.create();
+    EncodingDetector detector = new EncodingDetector(
+        NutchConfiguration.create());
+
+    // do everything as bytes; don't want any conversion
+    BufferedInputStream istr = new BufferedInputStream(new FileInputStream(
+        args[0]));
+    ByteArrayOutputStream ostr = new ByteArrayOutputStream();
+    byte[] bytes = new byte[1000];
+    boolean more = true;
+    while (more) {
+      int len = istr.read(bytes);
+      if (len < bytes.length) {
+        more = false;
+        if (len > 0) {
+          ostr.write(bytes, 0, len);
+        }
+      } else {
+        ostr.write(bytes);
+      }
+    }
+
+    byte[] data = ostr.toByteArray();
+
+    // make a fake Content
+    Content content = new Content("", "", data, "text/html", new Metadata(),
+        conf);
+
+    detector.autoDetectClues(content, true);
+    String encoding = detector.guessEncoding(content,
+        conf.get("parser.character.encoding.default"));
+    System.out.println("Guessed encoding: " + encoding);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/FSUtils.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/FSUtils.java b/nutch-core/src/main/java/org/apache/nutch/util/FSUtils.java
new file mode 100644
index 0000000..6aed8d5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/FSUtils.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+
+/**
+ * Utility methods for common filesystem operations.
+ */
+public class FSUtils {
+
+  /**
+   * Replaces the current path with the new path and if set removes the old
+   * path. If removeOld is set to false then the old path will be set to the
+   * name current.old.
+   * 
+   * @param fs
+   *          The FileSystem.
+   * @param current
+   *          The end path, the one being replaced.
+   * @param replacement
+   *          The path to replace with.
+   * @param removeOld
+   *          True if we are removing the current path.
+   * 
+   * @throws IOException
+   *           If an error occurs during replacement.
+   */
+  public static void replace(FileSystem fs, Path current, Path replacement,
+      boolean removeOld) throws IOException {
+
+    // rename any current path to old
+    Path old = new Path(current + ".old");
+    if (fs.exists(current)) {
+      fs.rename(current, old);
+    }
+
+    // rename the new path to current and remove the old path if needed
+    fs.rename(replacement, current);
+    if (fs.exists(old) && removeOld) {
+      fs.delete(old, true);
+    }
+  }
+
+  /**
+   * Closes a group of SequenceFile readers.
+   * 
+   * @param readers
+   *          The SequenceFile readers to close.
+   * @throws IOException
+   *           If an error occurs while closing a reader.
+   */
+  public static void closeReaders(SequenceFile.Reader[] readers)
+      throws IOException {
+
+    // loop through the readers, closing one by one
+    if (readers != null) {
+      for (int i = 0; i < readers.length; i++) {
+        SequenceFile.Reader reader = readers[i];
+        if (reader != null) {
+          reader.close();
+        }
+      }
+    }
+  }
+
+  /**
+   * Closes a group of MapFile readers.
+   * 
+   * @param readers
+   *          The MapFile readers to close.
+   * @throws IOException
+   *           If an error occurs while closing a reader.
+   */
+  public static void closeReaders(MapFile.Reader[] readers) throws IOException {
+
+    // loop through the readers closing one by one
+    if (readers != null) {
+      for (int i = 0; i < readers.length; i++) {
+        MapFile.Reader reader = readers[i];
+        if (reader != null) {
+          reader.close();
+        }
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/GZIPUtils.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/GZIPUtils.java b/nutch-core/src/main/java/org/apache/nutch/util/GZIPUtils.java
new file mode 100644
index 0000000..63b10e2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/GZIPUtils.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A collection of utility methods for working on GZIPed data.
+ */
+public class GZIPUtils {
+
+  private static final Logger LOG = LoggerFactory.getLogger(GZIPUtils.class);
+  private static final int EXPECTED_COMPRESSION_RATIO = 5;
+  private static final int BUF_SIZE = 4096;
+
+  /**
+   * Returns a gunzipped copy of the input array. If the gzipped input has been
+   * truncated or corrupted, a best-effort attempt is made to unzip as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
+   */
+  public static final byte[] unzipBestEffort(byte[] in) {
+    return unzipBestEffort(in, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Returns a gunzipped copy of the input array, truncated to
+   * <code>sizeLimit</code> bytes, if necessary. If the gzipped input has been
+   * truncated or corrupted, a best-effort attempt is made to unzip as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
+   */
+  public static final byte[] unzipBestEffort(byte[] in, int sizeLimit) {
+    try {
+      // decompress using GZIPInputStream
+      ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+          EXPECTED_COMPRESSION_RATIO * in.length);
+
+      GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(
+          in));
+
+      byte[] buf = new byte[BUF_SIZE];
+      int written = 0;
+      while (true) {
+        try {
+          int size = inStream.read(buf);
+          if (size <= 0)
+            break;
+          if ((written + size) > sizeLimit) {
+            outStream.write(buf, 0, sizeLimit - written);
+            break;
+          }
+          outStream.write(buf, 0, size);
+          written += size;
+        } catch (Exception e) {
+          break;
+        }
+      }
+      try {
+        outStream.close();
+      } catch (IOException e) {
+      }
+
+      return outStream.toByteArray();
+
+    } catch (IOException e) {
+      return null;
+    }
+  }
+
+  /**
+   * Returns a gunzipped copy of the input array.
+   * 
+   * @throws IOException
+   *           if the input cannot be properly decompressed
+   */
+  public static final byte[] unzip(byte[] in) throws IOException {
+    // decompress using GZIPInputStream
+    ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+        EXPECTED_COMPRESSION_RATIO * in.length);
+
+    GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(in));
+
+    byte[] buf = new byte[BUF_SIZE];
+    while (true) {
+      int size = inStream.read(buf);
+      if (size <= 0)
+        break;
+      outStream.write(buf, 0, size);
+    }
+    outStream.close();
+
+    return outStream.toByteArray();
+  }
+
+  /**
+   * Returns a gzipped copy of the input array.
+   */
+  public static final byte[] zip(byte[] in) {
+    try {
+      // compress using GZIPOutputStream
+      ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length
+          / EXPECTED_COMPRESSION_RATIO);
+
+      GZIPOutputStream outStream = new GZIPOutputStream(byteOut);
+
+      try {
+        outStream.write(in);
+      } catch (Exception e) {
+        LOG.error("Error writing outStream: ", e);
+      }
+
+      try {
+        outStream.close();
+      } catch (IOException e) {
+        LOG.error("Error closing outStream: ", e);
+      }
+
+      return byteOut.toByteArray();
+
+    } catch (IOException e) {
+      LOG.error("Error: ", e);
+      return null;
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/GenericWritableConfigurable.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/GenericWritableConfigurable.java b/nutch-core/src/main/java/org/apache/nutch/util/GenericWritableConfigurable.java
new file mode 100644
index 0000000..755aad0
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/GenericWritableConfigurable.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.GenericWritable;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A generic Writable wrapper that can inject Configuration to
+ * {@link Configurable}s
+ */
+public abstract class GenericWritableConfigurable extends GenericWritable
+    implements Configurable {
+
+  private Configuration conf;
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    byte type = in.readByte();
+    Class<?> clazz = getTypes()[type];
+    try {
+      set((Writable) clazz.newInstance());
+    } catch (Exception e) {
+      e.printStackTrace();
+      throw new IOException("Cannot initialize the class: " + clazz);
+    }
+    Writable w = get();
+    if (w instanceof Configurable)
+      ((Configurable) w).setConf(conf);
+    w.readFields(in);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/HadoopFSUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/HadoopFSUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/HadoopFSUtil.java
new file mode 100644
index 0000000..6f471c1
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/HadoopFSUtil.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+public class HadoopFSUtil {
+
+  /**
+   * Returns PathFilter that passes all paths through.
+   */
+  public static PathFilter getPassAllFilter() {
+    return new PathFilter() {
+      public boolean accept(Path arg0) {
+        return true;
+      }
+    };
+  }
+
+  /**
+   * Returns PathFilter that passes directories through.
+   */
+  public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
+    return new PathFilter() {
+      public boolean accept(final Path path) {
+        try {
+          return fs.getFileStatus(path).isDirectory();
+        } catch (IOException ioe) {
+          return false;
+        }
+      }
+
+    };
+  }
+
+  /**
+   * Turns an array of FileStatus into an array of Paths.
+   */
+  public static Path[] getPaths(FileStatus[] stats) {
+    if (stats == null) {
+      return null;
+    }
+    if (stats.length == 0) {
+      return new Path[0];
+    }
+    Path[] res = new Path[stats.length];
+    for (int i = 0; i < stats.length; i++) {
+      res[i] = stats[i].getPath();
+    }
+    return res;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/JexlUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/JexlUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/JexlUtil.java
new file mode 100644
index 0000000..656a458
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/JexlUtil.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.Date;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.lang.time.DateUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A collection of Jexl utilit(y|ies).
+ */
+public class JexlUtil {
+
+  public static final Logger LOG = LoggerFactory.getLogger(JexlUtil.class);
+
+  /**
+   * 
+   */
+  public static Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+
+  /**
+   * Parses the given expression to a Jexl expression. This supports
+   * date parsing.
+   *
+   * @param expr the Jexl expression
+   * @return parsed Jexl expression or null in case of parse error
+   */
+  public static Expression parseExpression(String expr) {
+    if (expr == null) return null;
+    
+    try {
+      // Translate any date object into a long, dates must be specified as 2016-03-20T00:00:00Z
+      Matcher matcher = datePattern.matcher(expr);
+      if (matcher.find()) {
+        String date = matcher.group();
+        
+        // Parse the thing and get epoch!
+        Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
+        long time = parsedDate.getTime();
+        
+        // Replace in the original expression
+        expr = expr.replace(date, Long.toString(time));
+      }
+      
+      JexlEngine jexl = new JexlEngine();
+      jexl.setSilent(true);
+      jexl.setStrict(true);
+      return jexl.createExpression(expr);
+    } catch (Exception e) {
+      LOG.error(e.getMessage());
+    }
+    
+    return null;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/LockUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/LockUtil.java b/nutch-core/src/main/java/org/apache/nutch/util/LockUtil.java
new file mode 100644
index 0000000..7e3bb97
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/LockUtil.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Utility methods for handling application-level locking.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class LockUtil {
+
+  /**
+   * Create a lock file.
+   * 
+   * @param fs
+   *          filesystem
+   * @param lockFile
+   *          name of the lock file
+   * @param accept
+   *          if true, and the target file exists, consider it valid. If false
+   *          and the target file exists, throw an IOException.
+   * @throws IOException
+   *           if accept is false, and the target file already exists, or if
+   *           it's a directory.
+   */
+  public static void createLockFile(FileSystem fs, Path lockFile, boolean accept)
+      throws IOException {
+    if (fs.exists(lockFile)) {
+      if (!accept)
+        throw new IOException("lock file " + lockFile + " already exists.");
+      if (fs.getFileStatus(lockFile).isDirectory())
+        throw new IOException("lock file " + lockFile
+            + " already exists and is a directory.");
+      // do nothing - the file already exists.
+    } else {
+      // make sure parents exist
+      fs.mkdirs(lockFile.getParent());
+      fs.createNewFile(lockFile);
+    }
+  }
+
+  /**
+   * Remove lock file. NOTE: applications enforce the semantics of this file -
+   * this method simply removes any file with a given name.
+   * 
+   * @param fs
+   *          filesystem
+   * @param lockFile
+   *          lock file name
+   * @return false, if the lock file doesn't exist. True, if it existed and was
+   *         successfully removed.
+   * @throws IOException
+   *           if lock file exists but it is a directory.
+   */
+  public static boolean removeLockFile(FileSystem fs, Path lockFile)
+      throws IOException {
+    if (!fs.exists(lockFile))
+      return false;
+    if (fs.getFileStatus(lockFile).isDirectory())
+      throw new IOException("lock file " + lockFile
+          + " exists but is a directory!");
+    return fs.delete(lockFile, false);
+  }
+}


[50/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 5b3c687..7a70f9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,9 @@ build/
 runtime/
 logs/
 /bin/
+
+*.class
+target/
+nutch-core/target
+nutch-plugins/target
+nutch-plugins/*/target
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/bin/crawl
----------------------------------------------------------------------
diff --git a/bin/crawl b/bin/crawl
new file mode 100755
index 0000000..567d35e
--- /dev/null
+++ b/bin/crawl
@@ -0,0 +1,281 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
+#    -i|--index      Indexes crawl results into a configured indexer
+#    -w|--wait       NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
+#                    are scheduled for fetching. Suffix can be: s for second,
+#                    m for minute, h for hour and d for day. If no suffix is
+#                    specified second is used by default.
+#    -D              A Java property to pass to Nutch calls
+#    Seed Dir        Directory in which to look for a seeds file
+#    Crawl Dir       Directory where the crawl/link/segments dirs are saved
+#    Num Rounds      The number of rounds to run this crawl for
+#
+#
+# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
+# INDEXING FOR EACH SEGMENT
+
+INDEXFLAG=false
+JAVA_PROPERTIES=""
+WAIT=-1 # don't wait if there are no URLs to fetch
+
+# Convert a duration such as "30s", "5m", "2h" or "1d" into a number of
+# seconds and echo it. A bare number (or unknown suffix) is taken as seconds.
+function __to_seconds() {
+  NUMBER=$(echo $1 | tr -dc '0-9')
+  # keep only the unit suffix letters; tr takes a literal SET, not a regex
+  MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')
+
+  # note: use SECS, not the bash builtin SECONDS which auto-increments
+  case $MODIFIER in
+      m|M)
+        SECS=`expr $NUMBER \* 60`
+        ;;
+      h|H)
+        # one hour is 3600 seconds (previous multiplier 120 was wrong)
+        SECS=`expr $NUMBER \* 3600`
+        ;;
+      d|D)
+        SECS=`expr $NUMBER \* 86400`
+        ;;
+      s|S|*)
+        SECS=$NUMBER
+        ;;
+  esac
+
+  echo $SECS
+}
+
+while [[ $# > 0 ]]
+do
+    case $1 in
+        -i|--index)
+            INDEXFLAG=true
+            shift
+            ;;
+        -D)
+            JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
+            shift 2
+            ;;
+        -w|--wait)
+            WAIT="${2}"
+            shift 2
+            ;;
+        *)
+            break
+            ;;
+    esac
+done
+
+if [[ $# != 3 ]]; then
+    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
+    echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
+    echo -e "\t-D\t\tA Java property to pass to Nutch calls"
+    echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
+    echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
+    echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
+    echo -e "\t\t\tspecified second is used by default."
+    echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
+    echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
+    echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+    exit 1
+fi
+
+SEEDDIR="$1"
+CRAWL_PATH="$2"
+LIMIT="$3"
+
+# convert wait time to seconds for compatibility reasons
+if [ "$WAIT" != "-1" ]; then
+  WAIT=$( __to_seconds "$WAIT" )
+  echo "Time to wait (--wait) = $WAIT sec."
+fi
+
+#############################################
+# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
+#############################################
+
+# set the number of slaves nodes
+numSlaves=1
+
+# and the total number of available tasks
+# sets Hadoop parameter "mapreduce.job.reduces"
+numTasks=`expr $numSlaves \* 2`
+
+# number of urls to fetch in one iteration
+# 250K per task?
+sizeFetchlist=`expr $numSlaves \* 50000`
+
+# time limit for fetching
+timeLimitFetch=180
+
+# num threads for fetching
+numThreads=50
+
+#############################################
+
+bin="`dirname "$0"`"
+bin="`cd "$bin"; pwd`"
+
+# determines whether mode based on presence of job file
+mode=local
+if [ -f "${bin}"/../*nutch*.job ]; then
+    mode=distributed
+fi
+
+# note that some of the options listed here could be set in the
+# corresponding hadoop site xml param file
+commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
+
+ # check that hadoop can be found on the path
+if [ $mode = "distributed" ]; then
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+    exit -1;
+ fi
+fi
+
+
+function __bin_nutch {
+    # run $bin/nutch, exit if exit value indicates error
+
+    echo "$bin/nutch $@" ;# echo command and arguments
+    "$bin/nutch" "$@"
+
+    RETCODE=$?
+    if [ $RETCODE -ne 0 ]
+    then
+        echo "Error running:"
+        echo "  $bin/nutch $@"
+        echo "Failed with exit value $RETCODE."
+        exit $RETCODE
+    fi
+}
+
+
+
+# initial injection
+echo "Injecting seed URLs"
+__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+
+# main loop : rounds of generate - fetch - parse - update
+for ((a=1; ; a++))
+do
+  if [ -e ".STOP" ]
+  then
+   echo "STOP file found - escaping loop"
+   break
+  fi
+
+  if [ $LIMIT -ne -1 ]; then
+    if [ $a -gt $LIMIT ]; then
+      echo `date` ": Finished loop with $LIMIT iterations"
+      break
+    fi
+    echo `date` ": Iteration $a of $LIMIT"
+  else
+    echo `date` ": Iteration $a"
+  fi
+
+  echo "Generating a new segment"
+  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+  echo "$bin/nutch generate ${generate_args[@]}"
+  $bin/nutch generate "${generate_args[@]}"
+  RETCODE=$?
+  if [ $RETCODE -eq 0 ]; then
+      : # ok: no error
+  elif [ $RETCODE -eq 1 ]; then
+    echo "Generate returned 1 (no new segments created)"
+
+    if [ "$WAIT" -ne -1 ]; then
+      echo "Waiting for $WAIT sec. ..."
+      sleep $WAIT
+      continue
+    else
+      echo "Escaping loop: no more URLs to fetch now"
+      break
+    fi
+  else
+    echo "Error running:"
+    echo "  $bin/nutch generate ${generate_args[@]}"
+    echo "Failed with exit value $RETCODE."
+    exit $RETCODE
+  fi
+
+  # capture the name of the segment
+  # call hadoop in distributed mode
+  # or use ls
+
+  if [ $mode = "local" ]; then
+   SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
+  else
+   SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments |  sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
+  fi
+
+  echo "Operating on segment : $SEGMENT"
+
+  # fetching the segment
+  echo "Fetching : $SEGMENT"
+  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+
+  # parsing the segment
+  echo "Parsing : $SEGMENT"
+  # enable the skipping of records for the parsing so that a dodgy document
+  # does not fail the full task
+  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
+  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
+
+  # updatedb with this segment
+  echo "CrawlDB update"
+  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT
+
+# note that the link inversion - indexing routine can be done within the main loop
+# on a per segment basis
+  echo "Link inversion"
+  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+  echo "Dedup on crawldb"
+  __bin_nutch dedup "$CRAWL_PATH"/crawldb
+
+  if $INDEXFLAG; then
+      echo "Indexing $SEGMENT to index"
+      __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+      echo "Cleaning up index if possible"
+      __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
+  else
+      echo "Skipping indexing ..."
+  fi
+
+  #######################################################
+  # The following commands fall into WebGraph territory
+  # and should be uncommented based on your requirements
+  #######################################################
+  #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
+  #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
+
+  #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
+  #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
+
+  #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
+  #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
+
+  #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and  WebGraph within $CRAWL_PATH"
+  #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
+
+  #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
+  #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
+
+done
+
+exit 0

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/bin/nutch
----------------------------------------------------------------------
diff --git a/bin/nutch b/bin/nutch
new file mode 100755
index 0000000..1649069
--- /dev/null
+++ b/bin/nutch
@@ -0,0 +1,324 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# The Nutch command script
+#
+# Environment Variables
+#
+#   NUTCH_JAVA_HOME The java implementation to use.  Overrides JAVA_HOME.
+#
+#   NUTCH_HEAPSIZE  The maximum amount of heap to use, in MB. 
+#                   Default is 1000.
+#
+#   NUTCH_OPTS      Extra Java runtime options.
+#                   Multiple options must be separated by white space.
+#
+#   NUTCH_LOG_DIR   Log directory (default: $NUTCH_HOME/logs)
+#
+#   NUTCH_LOGFILE   Log file (default: hadoop.log)
+#
+#   NUTCH_CONF_DIR  Path(s) to configuration files (default: $NUTCH_HOME/conf).
+#                   Multiple paths must be separated by a colon ':'.
+#
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+  ls=`ls -ld "$THIS"`
+  link=`expr "$ls" : '.*-> \(.*\)$'`
+  if expr "$link" : '.*/.*' > /dev/null; then
+    THIS="$link"
+  else
+    THIS=`dirname "$THIS"`/"$link"
+  fi
+done
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+  echo "nutch 1.12"
+  echo "Usage: nutch COMMAND"
+  echo "where COMMAND is one of:"
+  echo "  readdb            read / dump crawl db"
+  echo "  mergedb           merge crawldb-s, with optional filtering"
+  echo "  readlinkdb        read / dump link db"
+  echo "  inject            inject new urls into the database"
+  echo "  generate          generate new segments to fetch from crawl db"
+  echo "  freegen           generate new segments to fetch from text files"
+  echo "  fetch             fetch a segment's pages"
+  echo "  parse             parse a segment's pages"
+  echo "  readseg           read / dump segment data"
+  echo "  mergesegs         merge several segments, with optional filtering and slicing"
+  echo "  updatedb          update crawl db from segments after fetching"
+  echo "  invertlinks       create a linkdb from parsed segments"
+  echo "  mergelinkdb       merge linkdb-s, with optional filtering"
+  echo "  index             run the plugin-based indexer on parsed segments and linkdb"
+  echo "  dedup             deduplicate entries in the crawldb and give them a special status"
+  echo "  dump              exports crawled data from segments into files"
+  echo "  commoncrawldump   exports crawled data from segments into common crawl data format encoded as CBOR"
+  echo "  solrindex         run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
+  echo "  solrdedup         remove duplicates from solr - DEPRECATED use the dedup command instead"
+  echo "  solrclean         remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
+  echo "  clean             remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
+  echo "  parsechecker      check the parser for a given url"
+  echo "  indexchecker      check the indexing filters for a given url"
+  echo "  filterchecker     check url filters for a given url"
+  echo "  normalizerchecker check url normalizers for a given url"
+  echo "  domainstats       calculate domain statistics from crawldb"
+  echo "  protocolstats     calculate protocol status code stats from crawldb"
+  echo "  crawlcomplete     calculate crawl completion stats from crawldb"
+  echo "  webgraph          generate a web graph from existing segments"
+  echo "  linkrank          run a link analysis program on the generated web graph"
+  echo "  scoreupdater      updates the crawldb with linkrank scores"
+  echo "  nodedumper        dumps the web graph's node scores"
+  echo "  plugin            load a plugin and run one of its classes main()"
+  echo "  junit             runs the given JUnit test"
+  echo "  startserver       runs the Nutch Server on localhost:8081"
+  echo "  webapp            run a local Nutch Web Application on localhost:8080"
+  echo "  warc              exports crawled data from segments in the WARC format"
+  echo "  updatehostdb      update the host db with records from the crawl db"
+  echo "  readhostdb        read / dump host db"
+  echo " or"
+  echo "  CLASSNAME         run the class named CLASSNAME"
+  echo "Most commands print help when invoked w/o parameters."
+  exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+# some directories
+THIS_DIR="`dirname "$THIS"`"
+NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"
+
+# some Java parameters
+if [ "$NUTCH_JAVA_HOME" != "" ]; then
+  #echo "run java in $NUTCH_JAVA_HOME"
+  JAVA_HOME="$NUTCH_JAVA_HOME"
+fi
+  
+if [ "$JAVA_HOME" = "" ]; then
+  echo "Error: JAVA_HOME is not set."
+  exit 1
+fi
+
+local=true
+
+# NUTCH_JOB 
+if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then
+  local=false
+  for f in "$NUTCH_HOME"/*nutch*.job; do
+    NUTCH_JOB="$f"
+  done
+  # cygwin path translation
+  if $cygwin; then
+	NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"
+  fi
+fi
+
+JAVA="$JAVA_HOME/bin/java"
+JAVA_HEAP_MAX=-Xmx1000m 
+
+# check envvars which might override default args
+if [ "$NUTCH_HEAPSIZE" != "" ]; then
+  #echo "run with heapsize $NUTCH_HEAPSIZE"
+  JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
+  #echo $JAVA_HEAP_MAX
+fi
+
+# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
+CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
+CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# add libs to CLASSPATH
+if $local; then
+  for f in "$NUTCH_HOME"/lib/*.jar; do
+   CLASSPATH="${CLASSPATH}:$f";
+  done
+  # local runtime
+  # add plugins to classpath
+  if [ -d "$NUTCH_HOME/plugins" ]; then
+     CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
+  fi
+fi
+
+# cygwin path translation
+if $cygwin; then
+  CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
+fi
+
+# setup 'java.library.path' for native-hadoop code if necessary
+# used only in local mode 
+JAVA_LIBRARY_PATH=''
+if [ -d "${NUTCH_HOME}/lib/native" ]; then
+
+  JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`
+
+  if [ -d "${NUTCH_HOME}/lib/native" ]; then
+    if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+      JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+    else
+      JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+    fi
+  fi
+fi
+
+if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
+  JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`"
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+# default log directory & file
+if [ "$NUTCH_LOG_DIR" = "" ]; then
+  NUTCH_LOG_DIR="$NUTCH_HOME/logs"
+fi
+if [ "$NUTCH_LOGFILE" = "" ]; then
+  NUTCH_LOGFILE='hadoop.log'
+fi
+
+#Fix log path under cygwin
+if $cygwin; then
+  NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"
+fi
+
+NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR")
+NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE")
+
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+  NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH")
+fi
+
+# figure out which class to run
+if [ "$COMMAND" = "crawl" ] ; then
+  echo "Command $COMMAND is deprecated, please use bin/crawl instead"
+  exit 1
+elif [ "$COMMAND" = "inject" ] ; then
+  CLASS=org.apache.nutch.crawl.Injector
+elif [ "$COMMAND" = "generate" ] ; then
+  CLASS=org.apache.nutch.crawl.Generator
+elif [ "$COMMAND" = "freegen" ] ; then
+  CLASS=org.apache.nutch.tools.FreeGenerator
+elif [ "$COMMAND" = "fetch" ] ; then
+  CLASS=org.apache.nutch.fetcher.Fetcher
+elif [ "$COMMAND" = "parse" ] ; then
+  CLASS=org.apache.nutch.parse.ParseSegment
+elif [ "$COMMAND" = "readdb" ] ; then
+  CLASS=org.apache.nutch.crawl.CrawlDbReader
+elif [ "$COMMAND" = "mergedb" ] ; then
+  CLASS=org.apache.nutch.crawl.CrawlDbMerger
+elif [ "$COMMAND" = "readlinkdb" ] ; then
+  CLASS=org.apache.nutch.crawl.LinkDbReader
+elif [ "$COMMAND" = "readseg" ] ; then
+  CLASS=org.apache.nutch.segment.SegmentReader
+elif [ "$COMMAND" = "mergesegs" ] ; then
+  CLASS=org.apache.nutch.segment.SegmentMerger
+elif [ "$COMMAND" = "updatedb" ] ; then
+  CLASS=org.apache.nutch.crawl.CrawlDb
+elif [ "$COMMAND" = "invertlinks" ] ; then
+  CLASS=org.apache.nutch.crawl.LinkDb
+elif [ "$COMMAND" = "mergelinkdb" ] ; then
+  CLASS=org.apache.nutch.crawl.LinkDbMerger
+elif [ "$COMMAND" = "dump" ] ; then
+  CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+  CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
+elif [ "$COMMAND" = "solrindex" ] ; then
+  # deprecated alias: the Solr URL is passed through as a Hadoop property
+  CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
+  shift
+elif [ "$COMMAND" = "index" ] ; then
+  CLASS=org.apache.nutch.indexer.IndexingJob
+elif [ "$COMMAND" = "solrdedup" ] ; then
+  echo "Command $COMMAND is deprecated, please use dedup instead"
+  exit 1
+elif [ "$COMMAND" = "dedup" ] ; then
+  CLASS=org.apache.nutch.crawl.DeduplicationJob
+elif [ "$COMMAND" = "solrclean" ] ; then
+  CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
+  shift; shift
+elif [ "$COMMAND" = "clean" ] ; then
+  CLASS=org.apache.nutch.indexer.CleaningJob
+elif [ "$COMMAND" = "parsechecker" ] ; then
+  CLASS=org.apache.nutch.parse.ParserChecker
+elif [ "$COMMAND" = "indexchecker" ] ; then
+  CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
+elif [ "$COMMAND" = "filterchecker" ] ; then
+  CLASS=org.apache.nutch.net.URLFilterChecker
+elif [ "$COMMAND" = "normalizerchecker" ] ; then
+  CLASS=org.apache.nutch.net.URLNormalizerChecker
+elif [ "$COMMAND" = "domainstats" ] ; then
+  CLASS=org.apache.nutch.util.domain.DomainStatistics
+elif [ "$COMMAND" = "protocolstats" ] ; then
+  CLASS=org.apache.nutch.util.ProtocolStatusStatistics
+elif [ "$COMMAND" = "crawlcomplete" ] ; then
+  CLASS=org.apache.nutch.util.CrawlCompletionStats
+elif [ "$COMMAND" = "webgraph" ] ; then
+  CLASS=org.apache.nutch.scoring.webgraph.WebGraph
+elif [ "$COMMAND" = "linkrank" ] ; then
+  CLASS=org.apache.nutch.scoring.webgraph.LinkRank
+elif [ "$COMMAND" = "scoreupdater" ] ; then
+  CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater
+elif [ "$COMMAND" = "nodedumper" ] ; then
+  CLASS=org.apache.nutch.scoring.webgraph.NodeDumper
+elif [ "$COMMAND" = "plugin" ] ; then
+  CLASS=org.apache.nutch.plugin.PluginRepository
+elif [ "$COMMAND" = "junit" ] ; then
+  # test classes and test-only jars are needed on the classpath (local mode)
+  CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
+  if $local; then
+    for f in "$NUTCH_HOME"/test/lib/*.jar; do
+      CLASSPATH="${CLASSPATH}:$f";
+    done
+  fi
+  CLASS=org.junit.runner.JUnitCore
+elif [ "$COMMAND" = "startserver" ] ; then
+  CLASS=org.apache.nutch.service.NutchServer
+elif [ "$COMMAND" = "webapp" ] ; then
+  CLASS=org.apache.nutch.webui.NutchUiServer
+elif [ "$COMMAND" = "warc" ] ; then
+  CLASS=org.apache.nutch.tools.warc.WARCExporter
+elif [ "$COMMAND" = "updatehostdb" ] ; then
+  CLASS=org.apache.nutch.hostdb.UpdateHostDb
+elif [ "$COMMAND" = "readhostdb" ] ; then
+  CLASS=org.apache.nutch.hostdb.ReadHostDb
+else
+  # fall through: treat COMMAND as a fully qualified class name
+  CLASS=$COMMAND
+fi
+
+# distributed mode
+EXEC_CALL=(hadoop jar "$NUTCH_JOB")
+
+if $local; then
+ EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH")
+else
+ # check that hadoop can be found on the path; 'command -v' is more robust
+ # than piping 'which' through 'wc -l'
+ if ! command -v hadoop > /dev/null; then
+    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+    exit 1;
+ fi
+fi
+
+# run it
+exec "${EXEC_CALL[@]}" $CLASS "$@"
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml
new file mode 100644
index 0000000..62e2e58
--- /dev/null
+++ b/nutch-core/pom.xml
@@ -0,0 +1,522 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-parent</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>nutch-core</artifactId>
+    <packaging>jar</packaging>
+
+    <name>Apache Nutch</name>
+    <description>Nutch is an open source web-search software.
+        It builds on Hadoop, Tika and Solr, adding web-specifics,
+        such as a crawler, a link-graph database etc.
+    </description>
+    <url>http://nutch.apache.org</url>
+    <licenses>
+    <license>
+      <name>The Apache Software License, Version 2.0</name>
+      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      <distribution>repo</distribution>
+    </license>
+  </licenses>
+
+    <scm>
+    <developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/nutch.git</developerConnection>
+    <connection>scm:git:http://git-wip-us.apache.org/repos/asf/nutch.git</connection>
+    <url>https://git-wip-us.apache.org/repos/asf/nutch.git</url>
+  </scm>
+
+  <pluginRepositories>
+    <pluginRepository>
+      <id>miredot</id>
+      <name>MireDot Releases</name>
+      <url>http://nexus.qmino.com/content/repositories/miredot</url>
+    </pluginRepository>
+  </pluginRepositories>
+
+  <developers>
+    <developer>
+      <id>mattmann</id>
+      <name>Chris A. Mattmann</name>
+      <email>mattmann@apache.org</email>
+    </developer>
+    <developer>
+      <id>jnioche</id>
+      <name>Julien Nioche</name>
+      <email>jnioche@apache.org</email>
+    </developer>
+    <developer>
+      <id>lewismc</id>
+      <name>Lewis John McGibbney</name>
+      <email>lewismc@apache.org</email>
+    </developer>
+    <developer>
+      <id>markus</id>
+      <name>Markus Jelsma</name>
+      <email>markus@apache.org</email>
+    </developer>
+    <developer>
+      <id>fenglu</id>
+      <name>Feng Lu</name>
+      <email>fenglu@apache.org</email>
+    </developer>
+    <developer>
+      <id>kiranch</id>
+      <name>Kiran Chitturi</name>
+      <email>kiranch@apache.org</email>
+    </developer>
+    <developer>
+      <id>tejasp</id>
+      <name>Tejas Patil</name>
+      <email>tejasp@apache.org</email>
+    </developer>
+    <developer>
+      <id>talat</id>
+      <name>Talat Uyarer</name>
+      <email>talat@apache.org</email>
+    </developer>
+    <developer>
+      <id>snagel</id>
+      <name>Sebastian Nagel</name>
+      <email>snagel@apache.org</email>
+    </developer>
+    <developer>
+      <id>thammegowda</id>
+      <name>Thamme Gowda</name>
+      <email>thammegowda@apache.org</email>
+    </developer>
+  </developers>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <slf4j.version>1.7.12</slf4j.version>
+        <junit.version>4.12</junit.version>
+        <dir.root>${project.parent.basedir}</dir.root>
+        <libs.dir>${dir.local}${file.separator}lib</libs.dir>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>${slf4j.version}</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-log4j12</artifactId>
+            <version>${slf4j.version}</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>commons-lang</groupId>
+            <artifactId>commons-lang</artifactId>
+            <version>2.6</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>commons-collections</groupId>
+            <artifactId>commons-collections</artifactId>
+            <version>3.2.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>commons-httpclient</groupId>
+            <artifactId>commons-httpclient</artifactId>
+            <version>3.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+            <version>1.10</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-compress</artifactId>
+            <version>1.9</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-jexl</artifactId>
+            <version>2.1.1</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.tdunning</groupId>
+            <artifactId>t-digest</artifactId>
+            <version>3.1</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+            <version>2.4.0</version>
+            <optional>true</optional>
+            <exclusions>
+                <exclusion>
+                    <groupId>hsqldb</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>net.sf.kosmosfs</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>net.java.dev.jets3t</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.eclipse.jdt</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>ant</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-hdfs</artifactId>
+            <version>2.4.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-mapreduce-client-core</artifactId>
+            <version>2.4.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+            <version>2.4.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>1.12</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu4j</artifactId>
+            <version>55.1</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>xerces</groupId>
+            <artifactId>xercesImpl</artifactId>
+            <version>2.11.0</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>xerces</groupId>
+            <artifactId>xmlParserAPIs</artifactId>
+            <version>2.6.2</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>oro</groupId>
+            <artifactId>oro</artifactId>
+            <version>2.0.8</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.google.guava</groupId>
+            <artifactId>guava</artifactId>
+            <version>16.0.1</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.github.crawler-commons</groupId>
+            <artifactId>crawler-commons</artifactId>
+            <version>0.6</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.martinkl.warc</groupId>
+            <artifactId>warc-hadoop</artifactId>
+            <version>0.1.0</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-frontend-jaxws</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-frontend-jaxrs</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-transports-http</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-transports-http-jetty</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-rs-client</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>2.5.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.dataformat</groupId>
+            <artifactId>jackson-dataformat-cbor</artifactId>
+            <version>2.5.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.jaxrs</groupId>
+            <artifactId>jackson-jaxrs-json-provider</artifactId>
+            <version>2.5.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>4.10.2</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.netpreserve.commons</groupId>
+            <artifactId>webarchive-commons</artifactId>
+            <version>1.1.5</version>
+            <optional>true</optional>
+            <exclusions>
+                <exclusion>
+                    <groupId>*</groupId>
+                    <artifactId>hadoop-core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.google.guava</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>junit</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>${junit.version}</version><!-- use the junit.version property (4.12) instead of a stale hard-coded 4.11 -->
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.mrunit</groupId>
+            <artifactId>mrunit</artifactId>
+            <version>1.1.0</version>
+            <classifier>hadoop2</classifier>
+            <optional>true</optional>
+            <exclusions>
+                <exclusion>
+                    <groupId>log4j</groupId>
+                    <artifactId>log4j</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.mortbay.jetty</groupId>
+            <artifactId>jetty-client</artifactId>
+            <version>6.1.22</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.mortbay.jetty</groupId>
+            <artifactId>jetty</artifactId>
+            <version>6.1.22</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.mortbay.jetty</groupId>
+            <artifactId>jetty-util</artifactId>
+            <version>6.1.22</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-collections4</artifactId>
+            <version>4.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework</groupId>
+            <artifactId>spring-core</artifactId>
+            <version>4.0.4.RELEASE</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework</groupId>
+            <artifactId>spring-context</artifactId>
+            <version>4.0.4.RELEASE</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework</groupId>
+            <artifactId>spring-web</artifactId>
+            <version>4.0.4.RELEASE</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.sun.jersey</groupId>
+            <artifactId>jersey-client</artifactId>
+            <version>1.8</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.j256.ormlite</groupId>
+            <artifactId>ormlite-jdbc</artifactId>
+            <version>4.48</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.h2database</groupId>
+            <artifactId>h2</artifactId>
+            <version>1.4.180</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.eclipse.persistence</groupId>
+            <artifactId>javax.persistence</artifactId>
+            <version>2.0.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.wicket</groupId>
+            <artifactId>wicket-core</artifactId>
+            <version>6.16.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.wicket</groupId>
+            <artifactId>wicket-spring</artifactId>
+            <version>6.16.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>de.agilecoders.wicket</groupId>
+            <artifactId>wicket-bootstrap-core</artifactId>
+            <version>0.9.2</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>de.agilecoders.wicket</groupId>
+            <artifactId>wicket-bootstrap-extensions</artifactId>
+            <version>0.9.2</version>
+            <optional>true</optional>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <resources>
+            <resource>
+                <directory>${project.parent.basedir}${file.separator}conf</directory>
+            </resource>
+        </resources>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <version>2.6</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-resources-plugin</artifactId>
+                <version>3.0.1</version>
+                <executions>
+                    <execution>
+                        <id>copy-resources</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>copy-resources</goal>
+                        </goals>
+                        <configuration>
+                            <outputDirectory>${libs.dir}</outputDirectory>
+                            <resources>
+                                <resource>
+                                    <directory>${project.build.directory}</directory>
+                                    <include>${build.finalName}.jar</include>
+                                </resource>
+                                <resource>
+                                    <directory>${project.basedir}</directory>
+                                    <include>plugin.xml</include>
+                                </resource>
+                            </resources>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.19.1</version>
+                <configuration>
+                    <excludedGroups>org.apache.nutch.test.IntegrationTest</excludedGroups>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-failsafe-plugin</artifactId>
+                <version>2.19.1</version>
+                <configuration>
+                    <systemPropertyVariables>
+                        <plugin.folders>../runtime/local/plugins</plugin.folders>
+                    </systemPropertyVariables>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
new file mode 100755
index 0000000..c259419
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
@@ -0,0 +1,227 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * This class provides common methods for implementations of
+ * <code>FetchSchedule</code>.
+ * 
+ * @author Andrzej Bialecki
+ */
+public abstract class AbstractFetchSchedule extends Configured implements
+    FetchSchedule {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(AbstractFetchSchedule.class);
+
+  /** Default re-fetch interval in seconds (conf key db.fetch.interval.default). */
+  protected int defaultInterval;
+  /** Maximum re-fetch interval in seconds (conf key db.fetch.interval.max). */
+  protected int maxInterval;
+
+  public AbstractFetchSchedule() {
+    super(null);
+  }
+
+  public AbstractFetchSchedule(Configuration conf) {
+    super(conf);
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null)
+      return;
+    defaultInterval = conf.getInt("db.fetch.interval.default", 0);
+    maxInterval = conf.getInt("db.fetch.interval.max", 0);
+    // Parameterized logging: no string concatenation when INFO is disabled.
+    LOG.info("defaultInterval={}", defaultInterval);
+    LOG.info("maxInterval={}", maxInterval);
+  }
+
+  /**
+   * Initialize fetch schedule related data. Implementations should at least set
+   * the <code>fetchTime</code> and <code>fetchInterval</code>. The default
+   * implementation sets the <code>fetchTime</code> to now, using the default
+   * <code>fetchInterval</code>.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          datum instance to be initialized (modified in place).
+   */
+  @Override
+  public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
+    datum.setFetchTime(System.currentTimeMillis());
+    datum.setFetchInterval(defaultInterval);
+    datum.setRetriesSinceFetch(0);
+    return datum;
+  }
+
+  /**
+   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
+   * successfully fetched page. NOTE: this implementation resets the retry
+   * counter - extending classes should call super.setFetchSchedule() to
+   * preserve this behavior.
+   */
+  @Override
+  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime,
+      long modifiedTime, int state) {
+    datum.setRetriesSinceFetch(0);
+    return datum;
+  }
+
+  /**
+   * This method specifies how to schedule refetching of pages marked as GONE.
+   * Default implementation increases fetchInterval by 50% but the value may
+   * never exceed <code>maxInterval</code>.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          datum instance to be adjusted.
+   * 
+   * @return adjusted page information, including all original information.
+   *         NOTE: this may be a different instance than @see CrawlDatum, but
+   *         implementations should make sure that it contains at least all
+   *         information from @see CrawlDatum.
+   */
+  @Override
+  public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime) {
+    // no page is truly GONE ... just increase the interval by 50%
+    // and try much later.
+    if ((datum.getFetchInterval() * 1.5f) < maxInterval)
+      datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
+    else
+      // cap slightly below the maximum so the datum stays schedulable
+      datum.setFetchInterval(maxInterval * 0.9f);
+    // intervals are in seconds, fetch times in milliseconds
+    datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000);
+    return datum;
+  }
+
+  /**
+   * This method adjusts the fetch schedule if fetching needs to be re-tried due
+   * to transient errors. The default implementation sets the next fetch time 1
+   * day in the future and increases the retry counter.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          page information.
+   * 
+   * @param prevFetchTime
+   *          previous fetch time.
+   * 
+   * @param prevModifiedTime
+   *          previous modified time.
+   * 
+   * @param fetchTime
+   *          current fetch time.
+   * 
+   * @return adjusted page information, including all original information.
+   *         NOTE: this may be a different instance than @see CrawlDatum, but
+   *         implementations should make sure that it contains at least all
+   *         information from @see CrawlDatum.
+   */
+  @Override
+  public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime) {
+    datum.setFetchTime(fetchTime + (long) SECONDS_PER_DAY * 1000);
+    datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
+    return datum;
+  }
+
+  /**
+   * This method return the last fetch time of the CrawlDatum, derived by
+   * subtracting the interval from the stored (next) fetch time.
+   * 
+   * @return the date as a long.
+   */
+  @Override
+  public long calculateLastFetchTime(CrawlDatum datum) {
+    return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
+  }
+
+  /**
+   * This method provides information whether the page is suitable for selection
+   * in the current fetchlist. NOTE: a true return value does not guarantee that
+   * the page will be fetched, it just allows it to be included in the further
+   * selection process based on scores. The default implementation checks
+   * <code>fetchTime</code>, if it is higher than the <code>curTime</code> it
+   * returns false, and true otherwise. It will also check that fetchTime is not
+   * too remote (more than <code>maxInterval</code>, in which case it lowers the
+   * interval and returns true.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          datum instance.
+   * 
+   * @param curTime
+   *          reference time (usually set to the time when the fetchlist
+   *          generation process was started).
+   * 
+   * @return true, if the page should be considered for inclusion in the current
+   *         fetchlist, otherwise false.
+   */
+  @Override
+  public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
+    // pages are never truly GONE - we have to check them from time to time.
+    // pages with too long fetchInterval are adjusted so that they fit within
+    // maximum fetchInterval (segment retention period).
+    if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
+      if (datum.getFetchInterval() > maxInterval) {
+        datum.setFetchInterval(maxInterval * 0.9f);
+      }
+      datum.setFetchTime(curTime);
+    }
+    // eligible only when the scheduled fetch time has been reached
+    return datum.getFetchTime() <= curTime;
+  }
+
+  /**
+   * This method resets fetchTime, fetchInterval, modifiedTime,
+   * retriesSinceFetch and page signature, so that it forces refetching.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          datum instance.
+   * 
+   * @param asap
+   *          if true, force refetch as soon as possible - this sets the
+   *          fetchTime to now. If false, force refetch whenever the next fetch
+   *          time is set.
+   */
+  @Override
+  public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
+    // reduce fetchInterval so that it fits within the max value
+    if (datum.getFetchInterval() > maxInterval)
+      datum.setFetchInterval(maxInterval * 0.9f);
+    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    datum.setRetriesSinceFetch(0);
+    datum.setSignature(null);
+    datum.setModifiedTime(0L);
+    if (asap)
+      datum.setFetchTime(System.currentTimeMillis());
+    return datum;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
new file mode 100755
index 0000000..08cad34
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class implements an adaptive re-fetch algorithm. This works as follows:
+ * <ul>
+ * <li>for pages that has changed since the last fetchTime, decrease their
+ * fetchInterval by a factor of DEC_FACTOR (default value is 0.2f).</li>
+ * <li>for pages that haven't changed since the last fetchTime, increase their
+ * fetchInterval by a factor of INC_FACTOR (default value is 0.2f).<br>
+ * If SYNC_DELTA property is true, then:
+ * <ul>
+ * <li>calculate a <code>delta = fetchTime - modifiedTime</code></li>
+ * <li>try to synchronize with the time of change, by shifting the next
+ * fetchTime by a fraction of the difference between the last modification time
+ * and the last fetch time. I.e. the next fetch time will be set to
+ * <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li>
+ * <li>if the adjusted fetch interval is bigger than the delta, then
+ * <code>fetchInterval = delta</code>.</li>
+ * </ul>
+ * </li>
+ * <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL
+ * (default is 1 minute).</li>
+ * <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL
+ * (default is 365 days).</li>
+ * </ul>
+ * <p>
+ * NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize
+ * the algorithm, so that the fetch interval either increases or decreases
+ * infinitely, with little relevance to the page changes. Please use
+ * {@link #main(String[])} method to test the values before applying them in a
+ * production system.
+ * </p>
+ * 
+ * @author Andrzej Bialecki
+ */
+public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
+
+  // FIX: logger was created for AbstractFetchSchedule.class, which mislabeled
+  // every log line emitted by this class.
+  public static final Logger LOG = LoggerFactory
+      .getLogger(AdaptiveFetchSchedule.class);
+
+  /** Factor by which the interval grows when the page was NOT modified. */
+  protected float INC_RATE;
+
+  /** Factor by which the interval shrinks when the page WAS modified. */
+  protected float DEC_RATE;
+
+  /** Upper bound for the fetch interval, in seconds. */
+  private float MAX_INTERVAL;
+
+  /** Lower bound for the fetch interval, in seconds. */
+  private float MIN_INTERVAL;
+
+  /** If true, shift the next fetch time towards the last modification time. */
+  private boolean SYNC_DELTA;
+
+  /** Fraction of (fetchTime - modifiedTime) used for the SYNC_DELTA shift. */
+  private double SYNC_DELTA_RATE;
+
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null)
+      return;
+    INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+    DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+    MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", (float) 60.0);
+    MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval",
+        (float) SECONDS_PER_DAY * 365); // 1 year
+    SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
+    SYNC_DELTA_RATE = conf.getFloat(
+        "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
+  }
+
+  @Override
+  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime,
+      long modifiedTime, int state) {
+    super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+        fetchTime, modifiedTime, state);
+
+    float interval = datum.getFetchInterval();
+    long refTime = fetchTime;
+
+    // https://issues.apache.org/jira/browse/NUTCH-1430
+    interval = (interval == 0) ? defaultInterval : interval;
+
+    if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
+      // Is fetch interval preset in CrawlDatum MD? Then use preset interval
+      FloatWritable customIntervalWritable = (FloatWritable) (datum
+          .getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
+      interval = customIntervalWritable.get();
+    } else {
+      if (modifiedTime <= 0)
+        modifiedTime = fetchTime;
+      switch (state) {
+      case FetchSchedule.STATUS_MODIFIED:
+        interval *= (1.0f - DEC_RATE);
+        break;
+      case FetchSchedule.STATUS_NOTMODIFIED:
+        interval *= (1.0f + INC_RATE);
+        break;
+      case FetchSchedule.STATUS_UNKNOWN:
+        break;
+      }
+      if (SYNC_DELTA) {
+        // try to synchronize with the time of change, but never let the
+        // interval drop below the observed change period (delta, in seconds)
+        long delta = (fetchTime - modifiedTime) / 1000L;
+        if (delta > interval)
+          interval = delta;
+        refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
+      }
+      // clamp into [MIN_INTERVAL, MAX_INTERVAL]
+      if (interval < MIN_INTERVAL) {
+        interval = MIN_INTERVAL;
+      } else if (interval > MAX_INTERVAL) {
+        interval = MAX_INTERVAL;
+      }
+    }
+
+    datum.setFetchInterval(interval);
+    datum.setFetchTime(refTime + Math.round(interval * 1000.0));
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+
+  /**
+   * Simulation harness: advances a virtual clock in one-day steps, changes the
+   * page every 30 days, and logs how well the adaptive schedule tracks it.
+   */
+  public static void main(String[] args) throws Exception {
+    FetchSchedule fs = new AdaptiveFetchSchedule();
+    fs.setConf(NutchConfiguration.create());
+    // we start the time at 0, for simplicity
+    long curTime = 0;
+    long delta = 1000L * 3600L * 24L; // 1 day (FIX: comment said "2 hours")
+    // we trigger the update of the page every 30 days
+    long update = 1000L * 3600L * 24L * 30L; // 30 days
+    boolean changed = true;
+    long lastModified = 0;
+    int miss = 0;
+    int totalMiss = 0;
+    int maxMiss = 0;
+    int fetchCnt = 0;
+    int changeCnt = 0;
+    // initial fetchInterval is 30 days (FIX: comment said "10 days")
+    CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
+    p.setFetchTime(0);
+    LOG.info(p.toString());
+    // let's move the timeline a couple of deltas
+    for (int i = 0; i < 10000; i++) {
+      if (lastModified + update < curTime) {
+        // System.out.println("i=" + i + ", lastModified=" + lastModified +
+        // ", update=" + update + ", curTime=" + curTime);
+        changed = true;
+        changeCnt++;
+        lastModified = curTime;
+      }
+      LOG.info(i + ". " + changed + "\twill fetch at "
+          + (p.getFetchTime() / delta) + "\tinterval "
+          + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+          + miss);
+      if (p.getFetchTime() <= curTime) {
+        fetchCnt++;
+        fs.setFetchSchedule(new Text("http://www.example.com"), p, p
+            .getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+            changed ? FetchSchedule.STATUS_MODIFIED
+                : FetchSchedule.STATUS_NOTMODIFIED);
+        LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+            + (p.getFetchTime() / delta) + "\tinterval "
+            + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
+        if (!changed)
+          miss++;
+        if (miss > maxMiss)
+          maxMiss = miss;
+        changed = false;
+        totalMiss += miss;
+        miss = 0;
+      }
+      if (changed)
+        miss++;
+      curTime += delta;
+    }
+    LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+        + " times.");
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
new file mode 100644
index 0000000..7fe3e1e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -0,0 +1,572 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.util.*;
+import java.util.Map.Entry;
+
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.jexl2.MapContext;
+
+import org.apache.hadoop.io.*;
+import org.apache.nutch.util.*;
+
+/** The crawl state of a URL. */
+public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
+  public static final String GENERATE_DIR_NAME = "crawl_generate";
+  public static final String FETCH_DIR_NAME = "crawl_fetch";
+  public static final String PARSE_DIR_NAME = "crawl_parse";
+
+  // Serialization format version written by write(); readFields() converts
+  // records written by older versions on the fly.
+  private final static byte CUR_VERSION = 7;
+
+  /** Compatibility values for on-the-fly conversion from versions < 5. */
+  private static final byte OLD_STATUS_SIGNATURE = 0;
+  private static final byte OLD_STATUS_DB_UNFETCHED = 1;
+  private static final byte OLD_STATUS_DB_FETCHED = 2;
+  private static final byte OLD_STATUS_DB_GONE = 3;
+  private static final byte OLD_STATUS_LINKED = 4;
+  private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
+  private static final byte OLD_STATUS_FETCH_RETRY = 6;
+  private static final byte OLD_STATUS_FETCH_GONE = 7;
+
+  // Maps pre-version-5 status codes to their current equivalents (see the
+  // static initializer below and the translation in readFields()).
+  private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>();
+
+  // Status codes are partitioned by range: CrawlDb statuses occupy
+  // 0x01..STATUS_DB_MAX, fetch statuses (STATUS_DB_MAX, STATUS_FETCH_MAX],
+  // and transient markers start at 0x41.
+
+  /** Page was not fetched yet. */
+  public static final byte STATUS_DB_UNFETCHED = 0x01;
+  /** Page was successfully fetched. */
+  public static final byte STATUS_DB_FETCHED = 0x02;
+  /** Page no longer exists. */
+  public static final byte STATUS_DB_GONE = 0x03;
+  /** Page temporarily redirects to other page. */
+  public static final byte STATUS_DB_REDIR_TEMP = 0x04;
+  /** Page permanently redirects to other page. */
+  public static final byte STATUS_DB_REDIR_PERM = 0x05;
+  /** Page was successfully fetched and found not modified. */
+  public static final byte STATUS_DB_NOTMODIFIED = 0x06;
+  public static final byte STATUS_DB_DUPLICATE = 0x07;
+
+  /** Maximum value of DB-related status. */
+  public static final byte STATUS_DB_MAX = 0x1f;
+
+  /** Fetching was successful. */
+  public static final byte STATUS_FETCH_SUCCESS = 0x21;
+  /** Fetching unsuccessful, needs to be retried (transient errors). */
+  public static final byte STATUS_FETCH_RETRY = 0x22;
+  /** Fetching temporarily redirected to other page. */
+  public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
+  /** Fetching permanently redirected to other page. */
+  public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
+  /** Fetching unsuccessful - page is gone. */
+  public static final byte STATUS_FETCH_GONE = 0x25;
+  /** Fetching successful - page is not modified. */
+  public static final byte STATUS_FETCH_NOTMODIFIED = 0x26;
+
+  /** Maximum value of fetch-related status. */
+  public static final byte STATUS_FETCH_MAX = 0x3f;
+
+  /** Page signature. */
+  public static final byte STATUS_SIGNATURE = 0x41;
+  /** Page was newly injected. */
+  public static final byte STATUS_INJECTED = 0x42;
+  /** Page discovered through a link. */
+  public static final byte STATUS_LINKED = 0x43;
+  /** Page got metadata from a parser */
+  public static final byte STATUS_PARSE_META = 0x44;
+
+  // Human-readable names for status codes, used by getStatusName()/toString().
+  public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>();
+  static {
+    statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
+    statNames.put(STATUS_DB_FETCHED, "db_fetched");
+    statNames.put(STATUS_DB_GONE, "db_gone");
+    statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp");
+    statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
+    statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
+    statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
+    statNames.put(STATUS_SIGNATURE, "signature");
+    statNames.put(STATUS_INJECTED, "injected");
+    statNames.put(STATUS_LINKED, "linked");
+    statNames.put(STATUS_FETCH_SUCCESS, "fetch_success");
+    statNames.put(STATUS_FETCH_RETRY, "fetch_retry");
+    statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp");
+    statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
+    statNames.put(STATUS_FETCH_GONE, "fetch_gone");
+    statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
+    statNames.put(STATUS_PARSE_META, "parse_metadata");
+
+    oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
+    oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
+    oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
+    oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE);
+    oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS);
+    oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY);
+    oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED);
+    oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE);
+  }
+
+  // One of the STATUS_* codes above.
+  private byte status;
+  // Milliseconds since epoch; either last or next fetch time (see getFetchTime()).
+  private long fetchTime = System.currentTimeMillis();
+  private byte retries;
+  // Re-fetch interval in seconds.
+  private int fetchInterval;
+  private float score = 0.0f;
+  // Content signature, at most 256 bytes (enforced by setSignature()).
+  private byte[] signature = null;
+  // Milliseconds since epoch.
+  private long modifiedTime;
+  // Lazily instantiated; may be null until getMetaData()/readFields() runs.
+  private org.apache.hadoop.io.MapWritable metaData;
+
+  /**
+   * @return true if the datum carries a CrawlDb status (code <= STATUS_DB_MAX).
+   */
+  public static boolean hasDbStatus(CrawlDatum datum) {
+    // Return the condition directly instead of if/return-true/return-false.
+    return datum.status <= STATUS_DB_MAX;
+  }
+
+  /**
+   * @return true if the datum carries a fetch status (code in
+   *         (STATUS_DB_MAX, STATUS_FETCH_MAX]).
+   */
+  public static boolean hasFetchStatus(CrawlDatum datum) {
+    // Return the condition directly instead of if/return-true/return-false.
+    return datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX;
+  }
+
+  /** Creates an empty datum; fetchTime defaults to "now". */
+  public CrawlDatum() {
+  }
+
+  /**
+   * Creates a datum with the given status and fetch interval (seconds).
+   * The int status is narrowed to the byte STATUS_* code.
+   */
+  public CrawlDatum(int status, int fetchInterval) {
+    this();
+    this.status = (byte) status;
+    this.fetchInterval = fetchInterval;
+  }
+
+  /** Creates a datum with the given status, fetch interval (seconds) and score. */
+  public CrawlDatum(int status, int fetchInterval, float score) {
+    this(status, fetchInterval);
+    this.score = score;
+  }
+
+  //
+  // accessor methods
+  //
+
+  /** @return the STATUS_* code of this datum. */
+  public byte getStatus() {
+    return status;
+  }
+
+  /** Maps a status code to its symbolic name; unmapped codes yield "unknown". */
+  public static String getStatusName(byte value) {
+    final String name = statNames.get(value);
+    return (name == null) ? "unknown" : name;
+  }
+
+  /** Sets the status; the int is narrowed to the byte STATUS_* code. */
+  public void setStatus(int status) {
+    this.status = (byte) status;
+  }
+
+  /**
+   * Returns either the time of the last fetch, or the next fetch time,
+   * depending on whether Fetcher or CrawlDbReducer set the time.
+   */
+  public long getFetchTime() {
+    return fetchTime;
+  }
+
+  /**
+   * Sets either the time of the last fetch or the next fetch time, depending on
+   * whether Fetcher or CrawlDbReducer set the time.
+   */
+  public void setFetchTime(long fetchTime) {
+    this.fetchTime = fetchTime;
+  }
+
+  /** @return last-modified time in milliseconds since epoch. */
+  public long getModifiedTime() {
+    return modifiedTime;
+  }
+
+  public void setModifiedTime(long modifiedTime) {
+    this.modifiedTime = modifiedTime;
+  }
+
+  /** @return number of fetch retries since the last successful fetch. */
+  public byte getRetriesSinceFetch() {
+    return retries;
+  }
+
+  /** Sets the retry counter; the int is narrowed to a byte. */
+  public void setRetriesSinceFetch(int retries) {
+    this.retries = (byte) retries;
+  }
+
+  /** @return re-fetch interval in seconds. */
+  public int getFetchInterval() {
+    return fetchInterval;
+  }
+
+  public void setFetchInterval(int fetchInterval) {
+    this.fetchInterval = fetchInterval;
+  }
+
+  /** Float overload; rounds to the nearest whole second. */
+  public void setFetchInterval(float fetchInterval) {
+    this.fetchInterval = Math.round(fetchInterval);
+  }
+
+  public float getScore() {
+    return score;
+  }
+
+  public void setScore(float score) {
+    this.score = score;
+  }
+
+  /** @return the internal signature array (not a copy); may be null. */
+  public byte[] getSignature() {
+    return signature;
+  }
+
+  /**
+   * Sets the content signature. The 256-byte cap matches the single length
+   * byte used by write() to serialize the signature.
+   */
+  public void setSignature(byte[] signature) {
+    if (signature != null && signature.length > 256)
+      throw new RuntimeException("Max signature length (256) exceeded: "
+          + signature.length);
+    this.signature = signature;
+  }
+
+  /** Replaces the metadata with a copy of the given map. */
+  public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
+    this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
+  }
+
+  /**
+   * Add all metadata from other CrawlDatum to this CrawlDatum.
+   * 
+   * @param other
+   *          CrawlDatum
+   */
+  public void putAllMetaData(CrawlDatum other) {
+    for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
+      getMetaData().put(e.getKey(), e.getValue());
+    }
+  }
+
+  /**
+   * returns a MapWritable if it was set or read in @see readFields(DataInput),
+   * returns empty map in case CrawlDatum was freshly created (lazily
+   * instantiated).
+   */
+  public org.apache.hadoop.io.MapWritable getMetaData() {
+    if (this.metaData == null)
+      this.metaData = new org.apache.hadoop.io.MapWritable();
+    return this.metaData;
+  }
+
+  //
+  // writable methods
+  //
+
+  /** Convenience factory: constructs a CrawlDatum and deserializes it from in. */
+  public static CrawlDatum read(DataInput in) throws IOException {
+    CrawlDatum result = new CrawlDatum();
+    result.readFields(in);
+    return result;
+  }
+
+  /**
+   * Deserializes this datum, converting records written by older format
+   * versions (< CUR_VERSION) on the fly. Field order must mirror write().
+   *
+   * @throws VersionMismatchException if the record was written by a NEWER
+   *           version than this class understands.
+   */
+  public void readFields(DataInput in) throws IOException {
+    byte version = in.readByte(); // read version
+    if (version > CUR_VERSION) // check version
+      throw new VersionMismatchException(CUR_VERSION, version);
+
+    status = in.readByte();
+    fetchTime = in.readLong();
+    retries = in.readByte();
+    if (version > 5) {
+      fetchInterval = in.readInt();
+    } else
+      // versions <= 5 stored the interval as a float; round to whole seconds
+      fetchInterval = Math.round(in.readFloat());
+    score = in.readFloat();
+    if (version > 2) {
+      // modifiedTime and signature were added in version 3
+      modifiedTime = in.readLong();
+      int cnt = in.readByte();
+      if (cnt > 0) {
+        signature = new byte[cnt];
+        in.readFully(signature);
+      } else
+        signature = null;
+    }
+
+    if (version > 3) {
+      // metadata was added in version 4; a boolean flag marks its presence
+      boolean hasMetadata = false;
+      if (version < 7) {
+        // versions 4-6 used an older MapWritable encoding; copy entry by entry
+        org.apache.hadoop.io.MapWritable oldMetaData = new org.apache.hadoop.io.MapWritable();
+        if (in.readBoolean()) {
+          hasMetadata = true;
+          metaData = new org.apache.hadoop.io.MapWritable();
+          oldMetaData.readFields(in);
+        }
+        for (Writable key : oldMetaData.keySet()) {
+          metaData.put(key, oldMetaData.get(key));
+        }
+      } else {
+        if (in.readBoolean()) {
+          hasMetadata = true;
+          metaData = new org.apache.hadoop.io.MapWritable();
+          metaData.readFields(in);
+        }
+      }
+      // absent metadata is represented as null (lazily re-created by getMetaData())
+      if (hasMetadata == false)
+        metaData = null;
+    }
+    // translate status codes
+    if (version < 5) {
+      // unknown old codes default to STATUS_DB_UNFETCHED
+      if (oldToNew.containsKey(status))
+        status = oldToNew.get(status);
+      else
+        status = STATUS_DB_UNFETCHED;
+
+    }
+  }
+
+  /** The number of bytes into a CrawlDatum that the score is stored. */
+  // version(1) + status(1) + fetchTime(8) + retries(1) + fetchInterval(4) = 15
+  private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
+  // ... + score(4) + modifiedTime(8): offset of the signature length byte
+  private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;
+
+  /**
+   * Serializes this datum in the CUR_VERSION layout. The field order must
+   * match readFields() and the raw-byte offsets used by Comparator.
+   */
+  public void write(DataOutput out) throws IOException {
+    out.writeByte(CUR_VERSION); // store current version
+    out.writeByte(status);
+    out.writeLong(fetchTime);
+    out.writeByte(retries);
+    out.writeInt(fetchInterval);
+    out.writeFloat(score);
+    out.writeLong(modifiedTime);
+    if (signature == null) {
+      // length byte of 0 marks "no signature"
+      out.writeByte(0);
+    } else {
+      out.writeByte(signature.length);
+      out.write(signature);
+    }
+    // boolean flag marks presence of metadata (mirrored by readFields())
+    if (metaData != null && metaData.size() > 0) {
+      out.writeBoolean(true);
+      metaData.write(out);
+    } else {
+      out.writeBoolean(false);
+    }
+  }
+
+  /** Copy the contents of another instance into this instance. */
+  public void set(CrawlDatum that) {
+    this.status = that.status;
+    this.fetchTime = that.fetchTime;
+    this.retries = that.retries;
+    this.fetchInterval = that.fetchInterval;
+    this.score = that.score;
+    this.modifiedTime = that.modifiedTime;
+    // NOTE: the signature array is shared (shallow copy), unlike metaData below
+    this.signature = that.signature;
+    if (that.metaData != null) {
+      this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make
+                                                                           // a
+                                                                           // deep
+                                                                           // copy
+    } else {
+      this.metaData = null;
+    }
+  }
+
+  //
+  // compare methods
+  //
+
+  /** Sort by decreasing score. */
+  // Tie-break order: score desc, then status, fetchTime, retries,
+  // fetchInterval, modifiedTime, and finally the signature bytes.
+  public int compareTo(CrawlDatum that) {
+    if (that.score != this.score)
+      return (that.score - this.score) > 0 ? 1 : -1;
+    if (that.status != this.status)
+      return this.status - that.status;
+    if (that.fetchTime != this.fetchTime)
+      return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1;
+    if (that.retries != this.retries)
+      return that.retries - this.retries;
+    if (that.fetchInterval != this.fetchInterval)
+      return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
+    if (that.modifiedTime != this.modifiedTime)
+      return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
+    return SignatureComparator._compare(this, that);
+  }
+
+  /**
+   * A raw-bytes Comparator optimized for CrawlDatum: compares serialized
+   * records field by field (score, status, fetch time, retries, fetch
+   * interval, modified time, signature) without deserializing them.
+   */
+  public static class Comparator extends WritableComparator {
+    public Comparator() {
+      super(CrawlDatum.class);
+    }
+
+    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+      // Descending by score: higher scores sort first.
+      float score1 = readFloat(b1, s1 + SCORE_OFFSET);
+      float score2 = readFloat(b2, s2 + SCORE_OFFSET);
+      if (score2 != score1) {
+        return (score2 - score1) > 0 ? 1 : -1;
+      }
+      int status1 = b1[s1 + 1];
+      int status2 = b2[s2 + 1];
+      if (status2 != status1)
+        return status1 - status2;
+      long fetchTime1 = readLong(b1, s1 + 1 + 1);
+      long fetchTime2 = readLong(b2, s2 + 1 + 1);
+      if (fetchTime2 != fetchTime1)
+        return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;
+      int retries1 = b1[s1 + 1 + 1 + 8];
+      int retries2 = b2[s2 + 1 + 1 + 8];
+      if (retries2 != retries1)
+        return retries2 - retries1;
+      int fetchInterval1 = readInt(b1, s1 + 1 + 1 + 8 + 1);
+      int fetchInterval2 = readInt(b2, s2 + 1 + 1 + 8 + 1);
+      if (fetchInterval2 != fetchInterval1)
+        return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
+      long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
+      long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
+      if (modifiedTime2 != modifiedTime1)
+        return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
+      int sigl1 = b1[s1 + SIG_OFFSET];
+      int sigl2 = b2[s2 + SIG_OFFSET];
+      // Fix: honor the record start offsets s1/s2 when locating the
+      // signatures. The previous code passed the bare SIG_OFFSET for both
+      // records, which is only correct when each record happens to start at
+      // position 0 of its buffer — every other field read above adds s1/s2.
+      return SignatureComparator._compare(b1, s1 + SIG_OFFSET, sigl1, b2,
+          s2 + SIG_OFFSET, sigl2);
+    }
+  }
+
+  static { // register the raw-bytes Comparator as the default for CrawlDatum
+    WritableComparator.define(CrawlDatum.class, new Comparator());
+  }
+
+  //
+  // basic methods
+  //
+
+  /** Human-readable dump of all fields (and metadata entries), one per line. */
+  public String toString() {
+    StringBuilder buf = new StringBuilder();
+    buf.append("Version: ").append(CUR_VERSION).append("\n");
+    buf.append("Status: ").append(getStatus()).append(" (")
+        .append(getStatusName(getStatus())).append(")\n");
+    buf.append("Fetch time: ").append(new Date(getFetchTime())).append("\n");
+    buf.append("Modified time: ").append(new Date(getModifiedTime()))
+        .append("\n");
+    buf.append("Retries since fetch: ").append(getRetriesSinceFetch())
+        .append("\n");
+    buf.append("Retry interval: ").append(getFetchInterval())
+        .append(" seconds (")
+        .append(getFetchInterval() / FetchSchedule.SECONDS_PER_DAY)
+        .append(" days)\n");
+    buf.append("Score: ").append(getScore()).append("\n");
+    buf.append("Signature: ").append(StringUtil.toHexString(getSignature()))
+        .append("\n");
+    buf.append("Metadata: \n ");
+    if (metaData != null) {
+      for (Entry<Writable, Writable> e : metaData.entrySet()) {
+        buf.append("\t").append(e.getKey()).append("=").append(e.getValue())
+            .append("\n");
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Compares two metadata maps for equality, treating a null map and an
+   * empty map as equivalent.
+   */
+  private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
+    boolean thisEmpty = (metaData == null || metaData.size() == 0);
+    boolean otherEmpty = (otherMetaData == null || otherMetaData.size() == 0);
+    if (thisEmpty || otherEmpty) {
+      // Equal only when both sides are (effectively) empty.
+      return thisEmpty == otherEmpty;
+    }
+    // Entry-set comparison via HashSet ignores iteration order.
+    return new HashSet<Entry<Writable, Writable>>(metaData.entrySet())
+        .equals(new HashSet<Entry<Writable, Writable>>(otherMetaData.entrySet()));
+  }
+
+  /** Two CrawlDatum objects are equal when all fields and metadata match. */
+  public boolean equals(Object o) {
+    if (!(o instanceof CrawlDatum))
+      return false;
+    CrawlDatum other = (CrawlDatum) o;
+    // Short-circuits in the same order as before: cheap scalar fields first,
+    // then the signature bytes, the score, and finally the metadata map.
+    return this.status == other.status
+        && this.fetchTime == other.fetchTime
+        && this.modifiedTime == other.modifiedTime
+        && this.retries == other.retries
+        && this.fetchInterval == other.fetchInterval
+        && SignatureComparator._compare(this.signature, other.signature) == 0
+        && this.score == other.score
+        && metadataEquals(other.metaData);
+  }
+
+  /**
+   * Hash over all fields. Fixes two defects in the previous version: the
+   * loop bound {@code i < signature.length / 4} combined with the step
+   * {@code i += 4} skipped most of the signature, and the expression
+   * {@code signature[i] << 24 + signature[i + 1] << 16 + ...} was
+   * mis-parenthesized because '+' binds tighter than '<<' in Java.
+   */
+  public int hashCode() {
+    int res = 0;
+    if (signature != null) {
+      // Fold the signature four bytes at a time into a big-endian int;
+      // bytes are masked to avoid sign extension. Any trailing 1-3 bytes
+      // of a non-multiple-of-4 signature are ignored.
+      for (int i = 0; i + 3 < signature.length; i += 4) {
+        res ^= ((signature[i] & 0xff) << 24) | ((signature[i + 1] & 0xff) << 16)
+            | ((signature[i + 2] & 0xff) << 8) | (signature[i + 3] & 0xff);
+      }
+    }
+    if (metaData != null) {
+      res ^= metaData.entrySet().hashCode();
+    }
+    return res ^ status ^ ((int) fetchTime) ^ ((int) modifiedTime) ^ retries
+        ^ fetchInterval ^ Float.floatToIntBits(score);
+  }
+
+  /**
+   * Delegates to {@link Object#clone()}; a CloneNotSupportedException is
+   * rethrown wrapped in an unchecked RuntimeException so callers need not
+   * declare it.
+   */
+  public Object clone() {
+    try {
+      return super.clone();
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
+  }
+  
+  /**
+   * Evaluates a JEXL expression against this datum's fields and metadata
+   * (see https://issues.apache.org/jira/browse/NUTCH-2229).
+   *
+   * @param expr compiled JEXL expression; may be null
+   * @return true only if the expression evaluates to Boolean.TRUE; false for
+   *         a null expression, any other result, or an evaluation error
+   */
+  public boolean evaluate(Expression expr) {
+    if (expr == null) {
+      return false;
+    }
+
+    // Create a context and expose the core CrawlDatum fields.
+    JexlContext jcontext = new MapContext();
+    jcontext.set("status", getStatusName(getStatus()));
+    jcontext.set("fetchTime", (long) (getFetchTime()));
+    jcontext.set("modifiedTime", (long) (getModifiedTime()));
+    jcontext.set("retries", getRetriesSinceFetch());
+    // Integer.valueOf replaces the deprecated new Integer(...) constructor.
+    jcontext.set("interval", Integer.valueOf(getFetchInterval()));
+    jcontext.set("score", getScore());
+    jcontext.set("signature", StringUtil.toHexString(getSignature()));
+
+    // Expose numeric and textual metadata entries as context variables.
+    // The Writable types are mutually exclusive, so else-if is equivalent
+    // to the previous sequence of independent instanceof checks.
+    for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
+      Object value = entry.getValue();
+      if (value instanceof FloatWritable) {
+        jcontext.set(((Text) entry.getKey()).toString(),
+            ((FloatWritable) value).get());
+      } else if (value instanceof IntWritable) {
+        jcontext.set(((Text) entry.getKey()).toString(),
+            ((IntWritable) value).get());
+      } else if (value instanceof Text) {
+        // Replace '-' so the key can be referenced as a JEXL identifier.
+        jcontext.set(((Text) entry.getKey()).toString().replace("-", "_"),
+            ((Text) value).toString());
+      }
+    }
+
+    try {
+      return Boolean.TRUE.equals(expr.evaluate(jcontext));
+    } catch (Exception e) {
+      // A failing expression is deliberately treated as a non-match.
+      return false;
+    }
+  }
+}
\ No newline at end of file


[09/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/plugin.xml b/nutch-plugins/parsefilter-regex/plugin.xml
new file mode 100644
index 0000000..0725492
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-regex"
+   name="Regex Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-regex.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.htmlparsefilter.regex"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="RegexParseFilter" 
+                      class="org.apache.nutch.parsefilter.regex.RegexParseFilter">
+          <parameter name="file" value="regex-parsefilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/pom.xml b/nutch-plugins/parsefilter-regex/pom.xml
new file mode 100644
index 0000000..19b6452
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parsefilter-regex</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parsefilter-regex</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
new file mode 100644
index 0000000..0752c91
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.regex;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.FileReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.protocol.Content;
+
+import org.apache.commons.lang.StringUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.w3c.dom.*;
+
+/**
+ * RegexParseFilter. If a regular expression matches either HTML or 
+ * extracted text, a configurable field is set to true.
+ */
+public class RegexParseFilter implements HtmlParseFilter {
+  
+  private static final Logger LOG = LoggerFactory.getLogger(RegexParseFilter.class);
+  private static String attributeFile = null;
+  private String regexFile = null;
+  
+  private Configuration conf;
+  private DocumentFragment doc;
+  
+  private static final Map<String,RegexRule> rules = new HashMap<String,RegexRule>();
+  
+  public RegexParseFilter() {}
+  
+  public RegexParseFilter(String regexFile) {
+    this.regexFile = regexFile;
+  }
+
+  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+    Parse parse = parseResult.get(content.getUrl());
+    String html = new String(content.getContent());
+    String text = parse.getText();
+    
+    for (Map.Entry<String, RegexRule> entry : rules.entrySet()) {
+      String field = entry.getKey();
+      RegexRule regexRule = entry.getValue();
+      
+      String source = null;
+      if (regexRule.source.equalsIgnoreCase("html")) {
+        source = html;
+      }
+      if (regexRule.source.equalsIgnoreCase("text")) {
+        source = text;
+      }
+      
+      if (source == null) {
+        LOG.error("source for regex rule: " + field + " misconfigured");
+      }
+      
+      if (matches(source, regexRule.regex)) {
+        parse.getData().getParseMeta().set(field, "true");
+      } else {
+        parse.getData().getParseMeta().set(field, "false");
+      }
+    }
+    
+    return parseResult;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "parsefilter-regex";
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+      HtmlParseFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+          + " as " + attributeFile);
+      }
+    }
+    else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+          + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("parsefilter.regex.file");
+    String stringRules = conf.get("parsefilter.regex.rules");
+    if (regexFile != null) {
+      file = regexFile;
+    }
+    else if (attributeFile != null) {
+      file = attributeFile;
+    }
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+    try {
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readConfiguration(reader);
+    }
+    catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+  
+  private boolean matches(String value, Pattern pattern) {
+    if (value != null) {
+      Matcher matcher = pattern.matcher(value);
+      return matcher.find();
+    }
+       
+    return false;
+  }
+  
+  private synchronized void readConfiguration(Reader configReader) throws IOException {
+    if (rules.size() > 0) {
+      return;
+    }
+
+    String line;
+    BufferedReader reader = new BufferedReader(configReader);
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        line = line.trim();
+        String[] parts = line.split("\t");
+
+        String field = parts[0].trim();
+        String source = parts[1].trim();
+        String regex = parts[2].trim();
+        
+        rules.put(field, new RegexRule(source, regex));
+      }
+    }
+  }
+  
+  private static class RegexRule {
+    public RegexRule(String source, String regex) {
+      this.source = source;
+      this.regex = Pattern.compile(regex);
+    }
+    String source;
+    Pattern regex;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
new file mode 100644
index 0000000..f8f46ee
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * RegexParseFilter. If a regular expression matches either HTML or 
+ * extracted text, a configurable field is set to true.
+ */
+package org.apache.nutch.parsefilter.regex;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
new file mode 100644
index 0000000..9bd7149
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.regex;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import junit.framework.TestCase;
+
+/**
+ * Tests RegexParseFilter against the sample rule file
+ * regex-parsefilter.txt (rules: "first" matches h1 in HTML, "second"
+ * matches "blablabla" in the extracted text).
+ */
+public class TestRegexParseFilter extends TestCase {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  /**
+   * Runs the filter over the given HTML and extracted text and returns the
+   * resulting parse metadata (shared fixture for both tests).
+   */
+  private Metadata applyFilter(String html, String text) throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
+    RegexParseFilter filter = new RegexParseFilter(file);
+    filter.setConf(conf);
+
+    String url = "http://nutch.apache.org/";
+    Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf);
+    Parse parse = new ParseImpl(text, new ParseData());
+
+    ParseResult result = ParseResult.createParseResult(url, parse);
+    result = filter.filter(content, result, null, null);
+
+    return parse.getData().getParseMeta();
+  }
+
+  public void testPositiveFilter() throws Exception {
+    Metadata meta = applyFilter(
+        "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>",
+        "nutch this is the extracted text blablabla");
+
+    assertEquals("true", meta.get("first"));
+    assertEquals("true", meta.get("second"));
+  }
+
+  public void testNegativeFilter() throws Exception {
+    Metadata meta = applyFilter(
+        "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>",
+        "nutch this is the extracted text bla");
+
+    assertEquals("false", meta.get("first"));
+    assertEquals("false", meta.get("second"));
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
new file mode 100644
index 0000000..9d15cd8
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt
@@ -0,0 +1,10 @@
+# Example configuration file for parsefilter-regex
+#
+# Parse metadata field <name> is set to true if the HTML matches the regex. The
+# source can either be html or text. If source is html, the regex is applied to
+# the entire HTML tree. If source is text, the regex is applied to the
+# extracted text.
+#
+# format: <name>\t<source>\t<regex>\n
+first	html	h1
+second	text	blablabla

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/plugin.dtd
----------------------------------------------------------------------
diff --git a/nutch-plugins/plugin.dtd b/nutch-plugins/plugin.dtd
new file mode 100644
index 0000000..9b67da7
--- /dev/null
+++ b/nutch-plugins/plugin.dtd
@@ -0,0 +1,206 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ ! Licensed to the Apache Software Foundation (ASF) under one or more
+ ! contributor license agreements.  See the NOTICE file distributed with
+ ! this work for additional information regarding copyright ownership.
+ ! The ASF licenses this file to You under the Apache License, Version 2.0
+ ! (the "License"); you may not use this file except in compliance with
+ ! the License.  You may obtain a copy of the License at
+ !
+ !     http://www.apache.org/licenses/LICENSE-2.0
+ !
+ ! Unless required by applicable law or agreed to in writing, software
+ ! distributed under the License is distributed on an "AS IS" BASIS,
+ ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ! See the License for the specific language governing permissions and
+ ! limitations under the License.
+ !
+ !
+ !  Document   : plugin.dtd
+ !  Created on : 14 avril 2006, 22:14
+ !  Author     : Chris Mattmann, Jerome Charron
+ !  Description: Nutch plug-in manifest DTD
+ !
+ !  PUBLIC ID  : -//Apache Software Fundation//DTD Nutch Plugin Manifest 1.0//EN
+ !  SYSTEM ID  : http://lucene.apache.org/nutch/plugin.dtd
+-->
+
+
+
+<!--
+ ! The <plugin> element defines the body of the manifest.
+ ! It optionally contains definitions for the plug-in runtime,
+ ! definitions of other plug-ins required by this one,
+ ! declarations of any new extension points being introduced by the plug-in,
+ ! as well as configuration of functional extensions
+ ! (configured into extension points defined by other plug-ins,
+ ! or introduced by this plug-in).
+ !-->
+<!ELEMENT plugin (runtime?, requires?, extension-point*, extension*)>
+
+<!-- A user displayable name for the plug-in -->
+<!ATTLIST plugin name CDATA #REQUIRED>
+
+<!-- 
+ ! A unique identifier for the plug-in.
+ ! To minimize potential for naming collisions,
+ ! the identifier should be derived from the internet domain id
+ ! of the supplying provider (reversing the domain name tokens and
+ ! appending additional name tokens separated by dot [.]).
+ ! For example, provider nutch.org could define plug-in identifier
+ ! org.nutch.myplugin
+ !-->
+<!ATTLIST plugin id CDATA #REQUIRED>
+
+<!--
+ ! The plug-in version number.
+ ! NOTE : Version numbers compatibility are not yet implemented.
+ !-->
+<!ATTLIST plugin version CDATA #REQUIRED>
+
+<!-- The user-displayable name of the provider supplying the plug-in. -->
+<!ATTLIST plugin provider-name CDATA #IMPLIED>
+
+<!--
+ ! The name of the plug-in class for this plug-in.
+ ! The class must be a subclass of org.apache.nutch.plugin.Plugin
+ !-->
+<!ATTLIST plugin class CDATA #IMPLIED>
+
+
+<!-- 
+ ! The <requires> section of the manifest declares
+ ! any dependencies on other plug-ins.
+ !-->
+<!ELEMENT requires (import+)>
+
+
+<!-- Each dependency is specified using an <import> element. -->
+<!ELEMENT import EMPTY>
+
+<!-- The identifier of the required plug-in. -->
+<!ATTLIST import plugin CDATA #REQUIRED>
+
+
+<!--
+ ! The <runtime> section of the manifest contains a definition of one or more
+ ! libraries that make up the plug-in runtime.
+ ! The referenced libraries are used by the plugin execution mechanisms
+ ! (the plug-in class loader) to load and execute the correct code required by
+ ! the plug-in.
+ !-->
+<!ELEMENT runtime (library+)>
+
+
+<!--
+ !The <library> elements collectively define the plug-in runtime.
+ ! At least one <library> must be specified.
+ !-->
+<!ELEMENT library (export*)>
+
+<!--
+ ! A string reference to a library file or directory containing classes
+ ! (relative to the plug-in install directory).
+ ! Directory references must contain trailing file separator.
+ !-->
+<!ATTLIST library name CDATA #REQUIRED>
+
+
+<!--
+ ! Each <library> element can specify which portion
+ ! of the library should be exported.
+ ! The export rules are specified as a set of export masks.
+ ! By default (no export rules specified),
+ ! the library is considered to be private.
+ ! Each export mask is specified using the name attribute.
+ !-->
+<!ELEMENT export EMPTY>
+
+<!--
+ ! The export mask can have the following values:
+ !   * - indicates all contents of library are exported (public)
+ !   package.name.* - indicates all classes in the specified package
+ !                    are exported. The matching rules are the same as in the
+ !                    Java import statement.
+ !   package.name.ClassName - fully qualified java class name
+ !
+ ! NOTE : export mask is not yet implemented in Nutch.
+ !-->
+<!ATTLIST export name CDATA #REQUIRED>
+
+
+<!--
+ ! Nutch's architecture is based on the notion of configurable extension points.
+ ! Nutch itself predefines a set of extension points that cover the task of
+ ! extending it (for example, adding parser, indexing filter, ...).
+ ! In addition to the predefined extension points, each supplied plug-in can
+ ! declare additional extension points. By declaring an extension point the
+ ! plug-in is essentially advertising the ability to configure the plug-in
+ ! function with externally supplied extensions.
+ !-->
+<!ELEMENT extension-point EMPTY>
+
+<!-- A user-displayable name for the extension point. -->
+<!ATTLIST extension-point name CDATA #REQUIRED>
+
+<!-- A simple id, unique within this plug-in -->
+<!ATTLIST extension-point id CDATA #REQUIRED>
+
+
+<!--
+ ! Actual extensions are configured into extension points
+ ! (predefined, or newly declared in this plug-in) in the <extension> section.
+ !
+ ! The configuration information is specified by at least one implementation
+ ! with some parameters.
+ !-->
+<!ELEMENT extension (implementation+)>
+
+<!-- 
+ ! A reference to an extension point being configured.
+ ! The extension point can be one defined in this plug-in or another plug-in.
+ !-->
+<!ATTLIST extension point CDATA #REQUIRED>
+
+<!--
+ ! Optional identifier for this extension point configuration instance.
+ ! This is used by extension points that need to uniquely identify
+ ! (rather than just enumerate) the specific configured extensions.
+ ! The identifier is specified as a simple token unique within the definition
+ ! of the declaring plug-in. When used globally, the extension identifier
+ ! is qualified by the plug-in identifier.
+ ! FIXME : Seems it is never read in the code.
+ !-->
+<!ATTLIST extension id CDATA #IMPLIED>
+
+<!--
+ ! A user-displayable name for the extension.
+ ! FIXME : Seems it is never read in the code.
+ !-->
+<!ATTLIST extension name CDATA #IMPLIED>
+
+
+<!--
+ ! Defines a specific implementation for the extension.
+ ! This implementation can define some special name/value parameters
+ ! used at runtime.
+ !-->
+<!ELEMENT implementation (parameter*)>
+
+<!-- A unique identifier for this implementation -->
+<!ATTLIST implementation id CDATA #REQUIRED>
+
+<!-- The fully-qualified Java Class that implements this extension-point -->
+<!ATTLIST implementation class CDATA #REQUIRED>
+
+
+<!-- Defines a name/value parameter -->
+<!ELEMENT parameter EMPTY>
+
+<!-- The parameter's name (should be unique for an extension) -->
+<!ATTLIST parameter name CDATA #REQUIRED>
+
+<!-- The parameter's value -->
+<!ATTLIST parameter value CDATA #REQUIRED> 
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/plugin/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/plugin/pom.xml b/nutch-plugins/plugin/pom.xml
new file mode 100644
index 0000000..2ac06ee
--- /dev/null
+++ b/nutch-plugins/plugin/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>plugin</artifactId>
+    <packaging>jar</packaging>
+
+    <name>plugin</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml
new file mode 100644
index 0000000..e07f487
--- /dev/null
+++ b/nutch-plugins/pom.xml
@@ -0,0 +1,164 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-parent</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>nutch-plugins</artifactId>
+    <packaging>pom</packaging>
+
+    <name>nutch-plugins</name>
+    <url>http://nutch.apache.org</url>
+
+    <modules>
+        <!--<module>indexer-solr</module>-->
+        <module>creativecommons</module>
+        <module>feed</module>
+        <module>headings</module>
+        <module>index-anchor</module>
+        <module>index-basic</module>
+        <module>index-geoip</module>
+        <module>index-links</module>
+        <module>index-metadata</module>
+        <module>index-more</module>
+        <module>index-replace</module>
+        <module>index-static</module>
+        <module>indexer-cloudsearch</module>
+        <module>indexer-dummy</module>
+        <module>indexer-elastic</module>
+        <module>indexer-solr</module>
+        <module>language-identifier</module>
+        <module>lib-htmlunit</module>
+        <module>lib-http</module>
+        <module>lib-nekohtml</module>
+        <module>lib-regex-filter</module>
+        <module>lib-selenium</module>
+        <module>lib-xml</module>
+        <module>microformats-reltag</module>
+        <module>mimetype-filter</module>
+        <module>nutch-extensionpoints</module>
+        <module>parse-ext</module>
+        <module>parse-html</module>
+        <module>parse-js</module>
+        <module>parse-metatags</module>
+        <module>parse-replace</module>
+        <module>parse-swf</module>
+        <module>parse-tika</module>
+        <module>parse-zip</module>
+        <module>parsefilter-naivebayes</module>
+        <module>parsefilter-regex</module>
+        <module>plugin</module>
+        <module>protocol-file</module>
+        <module>protocol-ftp</module>
+        <module>protocol-htmlunit</module>
+        <module>protocol-http</module>
+        <module>protocol-httpclient</module>
+        <module>protocol-interactiveselenium</module>
+        <module>protocol-selenium</module>
+        <module>scoring-depth</module>
+        <module>scoring-link</module>
+        <module>scoring-opic</module>
+        <module>scoring-similarity</module>
+        <module>subcollection</module>
+        <module>tld</module>
+        <module>urlfilter-automaton</module>
+        <module>urlfilter-domain</module>
+        <module>urlfilter-domainblacklist</module>
+        <module>urlfilter-ignoreexempt</module>
+        <module>urlfilter-prefix</module>
+        <module>urlfilter-regex</module>
+        <module>urlfilter-suffix</module>
+        <module>urlfilter-validator</module>
+        <module>urlmeta</module>
+        <module>urlnormalizer-ajax</module>
+        <module>urlnormalizer-basic</module>
+        <module>urlnormalizer-host</module>
+        <module>urlnormalizer-pass</module>
+        <module>urlnormalizer-protocol</module>
+        <module>urlnormalizer-querystring</module>
+        <module>urlnormalizer-regex</module>
+        <module>urlnormalizer-slash</module>
+    </modules>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <!-- Note: an additional path level is needed because the child modules are nested one level deeper in the hierarchy -->
+        <dir.root>..${file.separator}..${file.separator}</dir.root>
+        <libs.dir>${dir.local.plugins}${file.separator}${project.artifactId}</libs.dir>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>nutch-core</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>nutch-core</artifactId>
+            <version>${project.parent.version}</version>
+            <scope>test</scope>
+            <type>test-jar</type>
+        </dependency>
+    </dependencies>
+    <build>
+        <finalName>${project.artifactId}</finalName>
+        <plugins>
+            <plugin>
+                <artifactId>maven-resources-plugin</artifactId>
+                <version>3.0.1</version>
+                <executions>
+                    <execution>
+                        <id>copy-resources</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>copy-resources</goal>
+                        </goals>
+                        <configuration>
+                            <outputDirectory>${libs.dir}</outputDirectory>
+                            <resources>
+                                <resource>
+                                    <directory>${project.build.directory}</directory>
+                                    <include>${build.finalName}.jar</include>
+                                </resource>
+                                <resource>
+                                    <directory>${project.basedir}</directory>
+                                    <include>plugin.xml</include>
+                                </resource>
+                            </resources>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.19.1</version>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-failsafe-plugin</artifactId>
+                <version>2.19.1</version>
+            </plugin>
+        </plugins>
+    </build>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/build.xml b/nutch-plugins/protocol-file/build.xml
new file mode 100644
index 0000000..121b1fe
--- /dev/null
+++ b/nutch-plugins/protocol-file/build.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-file" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+  
+ <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample">
+      <include name="*.txt"/>
+    </fileset>
+  </copy>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/ivy.xml b/nutch-plugins/protocol-file/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/protocol-file/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/plugin.xml b/nutch-plugins/protocol-file/plugin.xml
new file mode 100644
index 0000000..1647ce4
--- /dev/null
+++ b/nutch-plugins/protocol-file/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-file"
+   name="File Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="protocol-file.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.file"
+              name="FileProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.file.File"
+                      class="org.apache.nutch.protocol.file.File">
+        <parameter name="protocolName" value="file"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/pom.xml b/nutch-plugins/protocol-file/pom.xml
new file mode 100644
index 0000000..2ab2f75
--- /dev/null
+++ b/nutch-plugins/protocol-file/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-file</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-file</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
new file mode 100644
index 0000000..2712218
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java
@@ -0,0 +1,228 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.apache.nutch.util.NutchConfiguration;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/**
+ * This class is a protocol plugin used for file: scheme. It creates
+ * {@link FileResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code file.content.limit} and
+ * {@code file.crawl.parent} in nutch-default.xml defined under
+ * "file properties" section.
+ * 
+ * @author John Xing
+ */
+public class File implements Protocol {
+
+  public static final Logger LOG = LoggerFactory.getLogger(File.class);
+
+  static final int MAX_REDIRECTS = 5;
+
+  int maxContentLength;
+  boolean crawlParents;
+
+  /**
+   * if true return a redirect for symbolic links and do not resolve the links
+   * internally
+   */
+  boolean symlinksAsRedirects = true;
+
+  private Configuration conf;
+
+  public File() {
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+    this.crawlParents = conf.getBoolean("file.crawl.parent", true);
+    this.symlinksAsRedirects = conf.getBoolean(
+        "file.crawl.redirect_noncanonical", true);
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Set the length after which content is truncated.
+   */
+  public void setMaxContentLength(int maxContentLength) {
+    this.maxContentLength = maxContentLength;
+  }
+
+  /**
+   * Creates a {@link FileResponse} object corresponding to the url and return a
+   * {@link ProtocolOutput} object as per the content received
+   * 
+   * @param url
+   *          Text containing the url
+   * @param datum
+   *          The CrawlDatum object corresponding to the url
+   * 
+   * @return {@link ProtocolOutput} object for the content of the file indicated
+   *         by url
+   */
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+    String urlString = url.toString();
+    try {
+      URL u = new URL(urlString);
+
+      int redirects = 0;
+
+      while (true) {
+        FileResponse response;
+        response = new FileResponse(u, datum, this, getConf()); // make a
+                                                                // request
+
+        int code = response.getCode();
+
+        if (code == 200) { // got a good response
+          return new ProtocolOutput(response.toContent()); // return it
+
+        } else if (code == 304) { // got not modified
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatus.STATUS_NOTMODIFIED);
+
+        } else if (code == 401) { // access denied / no read permissions
+          return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+              ProtocolStatus.ACCESS_DENIED));
+
+        } else if (code == 404) { // no such file
+          return new ProtocolOutput(response.toContent(),
+              ProtocolStatus.STATUS_NOTFOUND);
+
+        } else if (code >= 300 && code < 400) { // handle redirect
+          u = new URL(response.getHeader("Location"));
+          if (LOG.isTraceEnabled()) {
+            LOG.trace("redirect to " + u);
+          }
+          if (symlinksAsRedirects) {
+            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+                ProtocolStatus.MOVED, u));
+          } else if (redirects == MAX_REDIRECTS) {
+            LOG.trace("Too many redirects: {}", url);
+            return new ProtocolOutput(response.toContent(), new ProtocolStatus(
+                ProtocolStatus.REDIR_EXCEEDED, u));
+          }
+          redirects++;
+
+        } else { // convert to exception
+          throw new FileError(code);
+        }
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+      return new ProtocolOutput(null, new ProtocolStatus(e));
+    }
+  }
+
+  /**
+   * Quick way for running this class. Useful for debugging.
+   */
+  public static void main(String[] args) throws Exception {
+    int maxContentLength = Integer.MIN_VALUE;
+    String logLevel = "info";
+    boolean dumpContent = false;
+    String urlString = null;
+
+    String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-logLevel")) {
+        logLevel = args[++i];
+      } else if (args[i].equals("-maxContentLength")) {
+        maxContentLength = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-dumpContent")) {
+        dumpContent = true;
+      } else if (i != args.length - 1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else
+        urlString = args[i];
+    }
+
+    File file = new File();
+    file.setConf(NutchConfiguration.create());
+
+    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
+      file.setMaxContentLength(maxContentLength);
+
+    // set log level
+    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
+
+    ProtocolOutput output = file.getProtocolOutput(new Text(urlString),
+        new CrawlDatum());
+    Content content = output.getContent();
+
+    System.err.println("URL: " + content.getUrl());
+    System.err.println("Status: " + output.getStatus());
+    System.err.println("Content-Type: " + content.getContentType());
+    System.err.println("Content-Length: "
+        + content.getMetadata().get(Response.CONTENT_LENGTH));
+    System.err.println("Last-Modified: "
+        + content.getMetadata().get(Response.LAST_MODIFIED));
+    String redirectLocation = content.getMetadata().get("Location");
+    if (redirectLocation != null) {
+      System.err.println("Location: " + redirectLocation);
+    }
+
+    if (dumpContent) {
+      System.out.print(new String(content.getContent()));
+    }
+
+    file = null;
+  }
+
+  /**
+   * No robots parsing is done for file protocol. So this returns a set of empty
+   * rules which will allow every url.
+   */
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+    return RobotRulesParser.EMPTY_RULES;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
new file mode 100644
index 0000000..4fef340
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+/**
+ * Thrown for File error codes.
+ */
+public class FileError extends FileException {
+
+  private int code;
+
+  public int getCode(int code) {
+    return code;
+  }
+
+  public FileError(int code) {
+    super("File Error: " + code);
+    this.code = code;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
new file mode 100644
index 0000000..f0467de
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+import org.apache.nutch.protocol.ProtocolException;
+
+public class FileException extends ProtocolException {
+
+  public FileException() {
+    super();
+  }
+
+  public FileException(String message) {
+    super(message);
+  }
+
+  public FileException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  public FileException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
new file mode 100644
index 0000000..b6e74ff
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -0,0 +1,317 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+// JDK imports
+import java.net.URL;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+
+// Tika imports
+import org.apache.tika.Tika;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/************************************
+ * FileResponse.java mimics file replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ * 
+ * Comments: (1) java.net.URL and java.net.URLConnection can handle file:
+ * scheme. However they are not flexible enough, so not used in this
+ * implementation.
+ * 
+ * (2) java.io.File is used for its abstractness across platforms. Warning:
+ * java.io.File API (1.4.2) does not elaborate on how special files, such as
+ * /dev/* in unix and /proc/* on linux, are treated. Tests show (a)
+ * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile()
+ * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are
+ * probably okay for now. Could be buggy here. How about special files on
+ * windows?
+ * 
+ * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They
+ * are just treated as individual files.
+ * 
+ * (4) No fancy POSIX file attributes yet. May never need?
+ * 
+ * @author John Xing
+ ***********************************/
+public class FileResponse {
+
+  private String orig;
+  private String base;
+  private byte[] content;
+  private static final byte[] EMPTY_CONTENT = new byte[0];
+  private int code;
+  private Metadata headers = new Metadata();
+
+  private final File file;
+  private Configuration conf;
+
+  private MimeUtil MIME;
+  private Tika tika;
+
+  /** Returns the response code. */
+  public int getCode() {
+    return code;
+  }
+
+  /** Returns the value of a named header. */
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+  public Content toContent() {
+    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
+        getHeader(Response.CONTENT_TYPE), headers, this.conf);
+  }
+
+  /**
+   * Default public constructor
+   * 
+   * @param url
+   * @param datum
+   * @param file
+   * @param conf
+   * @throws FileException
+   * @throws IOException
+   */
+  /**
+   * Builds the response for a {@code file:} URL: resolves the local path,
+   * maps filesystem conditions to HTTP-style status codes (404 not found,
+   * 401 unreadable, 300 non-canonical path, 304 unmodified) and captures
+   * file or directory-listing content on success.
+   *
+   * @param url   the file: URL to fetch
+   * @param datum crawl state; its modified time drives 304 handling
+   * @param file  owning protocol instance (supplies config limits/flags)
+   * @param conf  Hadoop configuration
+   * @throws FileException if the URL does not use the file: protocol
+   * @throws IOException   on filesystem access errors
+   */
+  public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
+      throws FileException, IOException {
+
+    this.orig = url.toString();
+    this.base = url.toString();
+    this.file = file;
+    this.conf = conf;
+
+    MIME = new MimeUtil(conf);
+    tika = new Tika();
+
+    if (!"file".equals(url.getProtocol()))
+      throw new FileException("Not a file url:" + url);
+
+    if (File.LOG.isTraceEnabled()) {
+      File.LOG.trace("fetching " + url);
+    }
+
+    // Use equals(), not !=: getPath() and getFile() return distinct String
+    // instances, so reference comparison would warn even when the values
+    // are identical.
+    if (!url.getPath().equals(url.getFile())) {
+      if (File.LOG.isWarnEnabled()) {
+        File.LOG.warn("url.getPath() != url.getFile(): " + url);
+      }
+    }
+
+    String path = "".equals(url.getPath()) ? "/" : url.getPath();
+
+    try {
+      // specify the encoding via the config later?
+      path = java.net.URLDecoder.decode(path, "UTF-8");
+    } catch (UnsupportedEncodingException ignored) {
+      // cannot happen: UTF-8 support is guaranteed by the Java platform
+    }
+
+    this.content = null;
+
+    // url.toURI() is only in j2se 1.5.0
+    // java.io.File f = new java.io.File(url.toURI());
+    java.io.File f = new java.io.File(path);
+
+    if (!f.exists()) {
+      this.code = 404; // http Not Found
+      return;
+    }
+
+    if (!f.canRead()) {
+      this.code = 401; // http Unauthorized
+      return;
+    }
+
+    // A path differing from its canonical form (symbolic link, relative
+    // segment, case difference on case-insensitive filesystems) is
+    // answered with a redirect to the canonical URL.
+    if (!f.equals(f.getCanonicalFile())) {
+      // Convert File -> URI -> URL so characters that are illegal in URLs
+      // are escaped automatically.
+      headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL()
+          .toString());
+
+      this.code = 300; // http redirect
+      return;
+    }
+
+    // Unchanged since the last fetch: report 304 Not Modified.
+    if (f.lastModified() <= datum.getModifiedTime()) {
+      this.code = 304;
+      this.headers.set("Last-Modified",
+          HttpDateFormat.toString(f.lastModified()));
+      return;
+    }
+
+    if (f.isDirectory()) {
+      getDirAsHttpResponse(f);
+    } else if (f.isFile()) {
+      getFileAsHttpResponse(f);
+    } else {
+      this.code = 500; // http Internal Server Error
+    }
+  }
+
+  // get file as http response
+  private void getFileAsHttpResponse(java.io.File f) throws FileException,
+      IOException {
+
+    // ignore file of size larger than
+    // Integer.MAX_VALUE = 2^31-1 = 2147483647
+    long size = f.length();
+    if (size > Integer.MAX_VALUE) {
+      throw new FileException("file is too large, size: " + size);
+      // or we can do this?
+      // this.code = 400; // http Bad request
+      // return;
+    }
+
+    // capture content
+    int len = (int) size;
+
+    if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
+      len = this.file.maxContentLength;
+
+    this.content = new byte[len];
+
+    java.io.InputStream is = new java.io.FileInputStream(f);
+    int offset = 0;
+    int n = 0;
+    while (offset < len
+        && (n = is.read(this.content, offset, len - offset)) >= 0) {
+      offset += n;
+    }
+    if (offset < len) { // keep whatever already have, but issue a warning
+      if (File.LOG.isWarnEnabled()) {
+        File.LOG.warn("not enough bytes read from file: " + f.getPath());
+      }
+    }
+    is.close();
+
+    // set headers
+    headers.set(Response.CONTENT_LENGTH, new Long(size).toString());
+    headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(f.lastModified()));
+
+    String mimeType = tika.detect(f);
+
+    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");
+
+    // response code
+    this.code = 200; // http OK
+  }
+
+  /**
+   * get dir list as http response
+   * 
+   * @param f
+   * @throws IOException
+   */
+  private void getDirAsHttpResponse(java.io.File f) throws IOException {
+
+    String path = f.toString();
+    if (this.file.crawlParents)
+      this.content = list2html(f.listFiles(), path, "/".equals(path) ? false
+          : true);
+    else
+      this.content = list2html(f.listFiles(), path, false);
+
+    // set headers
+    headers.set(Response.CONTENT_LENGTH,
+        new Integer(this.content.length).toString());
+    headers.set(Response.CONTENT_TYPE, "text/html");
+    headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(f.lastModified()));
+
+    // response code
+    this.code = 200; // http OK
+  }
+
+  /**
+   * generate html page from dir list
+   * 
+   * @param list
+   * @param path
+   * @param includeDotDot
+   * @return
+   */
+  /**
+   * Generates a simple HTML index page from a directory listing, one line
+   * per entry: directories link with a trailing slash, files show their
+   * size. Entries that are neither files nor directories are skipped.
+   *
+   * @param list          directory entries (as returned by File.listFiles())
+   * @param path          directory path shown in the page title and heading
+   * @param includeDotDot whether to emit a "../" parent link first
+   * @return page bytes in the platform default charset (kept as-is for
+   *         backward compatibility with existing behavior)
+   */
+  private byte[] list2html(java.io.File[] list, String path,
+      boolean includeDotDot) {
+
+    // StringBuilder: method-local buffer, no synchronization needed
+    StringBuilder x = new StringBuilder("<html><head>");
+    x.append("<title>Index of ").append(path).append("</title></head>\n");
+    x.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");
+
+    if (includeDotDot) {
+      x.append("<a href='../'>../</a>\t-\t-\t-\n");
+    }
+
+    // fix me: we might want to sort list here! but not now.
+    for (java.io.File f : list) {
+      String name = f.getName();
+      String time = HttpDateFormat.toString(f.lastModified());
+      if (f.isDirectory()) {
+        // listFiles() never returns "." or ".." entries, so no filtering
+        x.append("<a href='").append(name).append("/").append("'>")
+            .append(name).append("/</a>\t");
+        x.append(time).append("\t-\n");
+      } else if (f.isFile()) {
+        x.append("<a href='").append(name).append("'>").append(name)
+            .append("</a>\t");
+        x.append(time).append("\t").append(f.length()).append("\n");
+      }
+      // any other entry type is ignored
+    }
+
+    x.append("</pre></body></html>\n");
+
+    return x.toString().getBytes();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
new file mode 100644
index 0000000..221c79c
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving local file resources.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
new file mode 100644
index 0000000..5f95377
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.file;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * @author mattmann
+ * @version $Revision$
+ * 
+ *          <p>
+ *          Unit tests for the {@link File} protocol implementation.
+ *          </p>
+ */
+public class TestProtocolFile {
+
+  // platform-specific separator used when building file: URLs
+  private final String fileSeparator = System.getProperty("file.separator");
+  // directory holding the sample files; defaults to the working directory
+  private final String sampleDir = System.getProperty("test.data", ".");
+
+  private static final String[] testTextFiles = new String[] {
+      "testprotocolfile.txt", "testprotocolfile_(encoded).txt",
+      "testprotocolfile_%28encoded%29.txt" };
+
+  private static final CrawlDatum datum = new CrawlDatum();
+
+  private static final String expectedMimeType = "text/plain";
+
+  private Configuration conf;
+
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+  }
+
+  /** Runs the content-type check against every sample file. */
+  @Test
+  public void testSetContentType() throws ProtocolException {
+    for (int i = 0; i < testTextFiles.length; i++) {
+      setContentType(testTextFiles[i]);
+    }
+  }
+
+  /**
+   * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field.
+   * 
+   * @since NUTCH-384
+   * 
+   */
+  public void setContentType(String testTextFile) throws ProtocolException {
+    String fileUrl = "file:" + sampleDir + fileSeparator + testTextFile;
+    Assert.assertNotNull(fileUrl);
+    Protocol proto = new ProtocolFactory(conf).getProtocol(fileUrl);
+    ProtocolOutput result = proto.getProtocolOutput(new Text(fileUrl), datum);
+    Assert.assertNotNull(result);
+    Assert.assertEquals("Status code: [" + result.getStatus().getCode()
+        + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
+        + result.getStatus().getArgs() + "]", ProtocolStatus.SUCCESS, result
+        .getStatus().getCode());
+    Assert.assertNotNull(result.getContent());
+    Assert.assertNotNull(result.getContent().getContentType());
+    Assert.assertEquals(expectedMimeType, result.getContent().getContentType());
+    Assert.assertNotNull(result.getContent().getMetadata());
+    Assert.assertEquals(expectedMimeType, result.getContent().getMetadata()
+        .get(Response.CONTENT_TYPE));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
new file mode 100644
index 0000000..fbe8a8a
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt
@@ -0,0 +1 @@
+Protocol File Test

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
new file mode 100644
index 0000000..fbe8a8a
--- /dev/null
+++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt
@@ -0,0 +1 @@
+Protocol File Test

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/build.xml b/nutch-plugins/protocol-ftp/build.xml
new file mode 100644
index 0000000..79314d4
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-ftp" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/ivy.xml b/nutch-plugins/protocol-ftp/ivy.xml
new file mode 100644
index 0000000..214c445
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="commons-net" name="commons-net" rev="1.2.2" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/plugin.xml b/nutch-plugins/protocol-ftp/plugin.xml
new file mode 100644
index 0000000..1421e37
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-ftp"
+   name="Ftp Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-ftp.jar">
+         <export name="*"/>
+      </library>
+      <library name="commons-net-1.2.0-dev.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.ftp"
+              name="FtpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.ftp.Ftp"
+                      class="org.apache.nutch.protocol.ftp.Ftp">
+        <parameter name="protocolName" value="ftp"/>
+      </implementation>
+      
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/pom.xml b/nutch-plugins/protocol-ftp/pom.xml
new file mode 100644
index 0000000..fe9a61b
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-ftp</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-ftp</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>


[04/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/plugin.xml b/nutch-plugins/scoring-link/plugin.xml
new file mode 100644
index 0000000..2b1c1e1
--- /dev/null
+++ b/nutch-plugins/scoring-link/plugin.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-link"
+   name="Link Analysis Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="scoring-link.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.scoring.link"
+              name="LinkAnalysisScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter"
+        class="org.apache.nutch.scoring.link.LinkAnalysisScoringFilter" />
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/pom.xml b/nutch-plugins/scoring-link/pom.xml
new file mode 100644
index 0000000..3c7041e
--- /dev/null
+++ b/nutch-plugins/scoring-link/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>scoring-link</artifactId>
+    <packaging>jar</packaging>
+
+    <name>scoring-link</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
new file mode 100644
index 0000000..a143f46
--- /dev/null
+++ b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.link;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * Scoring filter that exposes link-analysis scores stored in the CrawlDb:
+ * the indexing score is the stored datum score scaled by the configured
+ * normalization factor, and the generator sort value multiplies the stored
+ * score into the initial sort value. Outlink distribution and db-score
+ * updates are intentionally no-ops.
+ */
+public class LinkAnalysisScoringFilter implements ScoringFilter {
+
+  private Configuration conf;
+  // multiplier applied to the stored score at indexing time
+  private float normalizedScore = 1.00f;
+
+  public LinkAnalysisScoringFilter() {
+
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
+  }
+
+  /** Leaves the adjust datum untouched: outlink scores are not distributed. */
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  /** Sort value is the stored score scaled by the initial sort value. */
+  @Override
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return datum.getScore() * initSort;
+  }
+
+  /** Indexing score is the db datum's score times the normalization factor. */
+  @Override
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return (normalizedScore * dbDatum.getScore());
+  }
+
+  /** New pages start with a zero score. */
+  @Override
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    datum.setScore(0.0f);
+  }
+
+  /** Injected scores are left as-is. */
+  @Override
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  /** Copies the score key from content metadata into parse metadata. */
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+    parse.getData().getContentMeta()
+        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
+  }
+
+  /** Stores the datum's score in the content metadata for later stages. */
+  @Override
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
+  }
+
+  /** No-op: db scores are maintained by the link-analysis jobs. */
+  @Override
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+    // nothing to do
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
new file mode 100644
index 0000000..9dc0c35
--- /dev/null
+++ b/nutch-plugins/scoring-link/src/main/java/org/apache/nutch/scoring/link/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter used in conjunction with
+ * {@link org.apache.nutch.scoring.webgraph.WebGraph}.
+ */
+package org.apache.nutch.scoring.link;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/build.xml b/nutch-plugins/scoring-opic/build.xml
new file mode 100644
index 0000000..137dab4
--- /dev/null
+++ b/nutch-plugins/scoring-opic/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-opic" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/ivy.xml b/nutch-plugins/scoring-opic/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/scoring-opic/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/plugin.xml b/nutch-plugins/scoring-opic/plugin.xml
new file mode 100644
index 0000000..3805a31
--- /dev/null
+++ b/nutch-plugins/scoring-opic/plugin.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-opic"
+   name="OPIC Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="scoring-opic.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.scoring.opic"
+              name="OPICScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.opic.OPICScoringFilter"
+                      class="org.apache.nutch.scoring.opic.OPICScoringFilter" />
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/pom.xml b/nutch-plugins/scoring-opic/pom.xml
new file mode 100644
index 0000000..58e0786
--- /dev/null
+++ b/nutch-plugins/scoring-opic/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>scoring-opic</artifactId>
+    <packaging>jar</packaging>
+
+    <name>scoring-opic</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
new file mode 100644
index 0000000..e943d06
--- /dev/null
+++ b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.opic;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+// Slf4j Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * This plugin implements a variant of an Online Page Importance Computation
+ * (OPIC) score, described in this paper: <a
+ * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html">
+ * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
+ * On-Line Page Importance Computation</a>.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class OPICScoringFilter implements ScoringFilter {
+
+  private final static Logger LOG = LoggerFactory
+      .getLogger(OPICScoringFilter.class);
+
+  private Configuration conf;
+  private float scoreInjected;
+  private float scorePower;
+  private float internalScoreFactor;
+  private float externalScoreFactor;
+  private boolean countFiltered;
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    scorePower = conf.getFloat("indexer.score.power", 0.5f);
+    internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
+    externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
+    countFiltered = conf.getBoolean("db.score.count.filtered", false);
+  }
+
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  /**
+   * Set to 0.0f (unknown value) - inlink contributions will bring it to a
+   * correct level. Newly discovered pages have at least one inlink.
+   */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    datum.setScore(0.0f);
+  }
+
+  /** Use {@link CrawlDatum#getScore()}. */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return datum.getScore() * initSort;
+  }
+
+  /** Increase the score by a sum of inlinked scores. */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+    float adjust = 0.0f;
+    for (int i = 0; i < inlinked.size(); i++) {
+      CrawlDatum linked = inlinked.get(i);
+      adjust += linked.getScore();
+    }
+    if (old == null)
+      old = datum;
+    datum.setScore(old.getScore() + adjust);
+  }
+
+  /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
+    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
+  }
+
+  /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
+  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
+    parse.getData().getContentMeta()
+        .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
+  }
+
+  /**
+   * Get a float value from Fetcher.SCORE_KEY, divide it by the number of
+   * outlinks and apply.
+   */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    float score = scoreInjected;
+    String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
+    if (scoreString != null) {
+      try {
+        score = Float.parseFloat(scoreString);
+      } catch (Exception e) {
+        LOG.error("Error: ", e);
+      }
+    }
+    int validCount = targets.size();
+    if (countFiltered) {
+      score /= allCount;
+    } else {
+      if (validCount == 0) {
+        // no outlinks to distribute score, so just return adjust
+        return adjust;
+      }
+      score /= validCount;
+    }
+    // internal and external score factor
+    float internalScore = score * internalScoreFactor;
+    float externalScore = score * externalScoreFactor;
+    for (Entry<Text, CrawlDatum> target : targets) {
+      try {
+        String toHost = new URL(target.getKey().toString()).getHost();
+        String fromHost = new URL(fromUrl.toString()).getHost();
+        if (toHost.equalsIgnoreCase(fromHost)) {
+          target.getValue().setScore(internalScore);
+        } else {
+          target.getValue().setScore(externalScore);
+        }
+      } catch (MalformedURLException e) {
+        LOG.error("Error: ", e);
+        target.getValue().setScore(externalScore);
+      }
+    }
+    // XXX (ab) no adjustment? I think this is contrary to the algorithm descr.
+    // XXX in the paper, where page "loses" its score if it's distributed to
+    // XXX linked pages...
+    return adjust;
+  }
+
+  /** Dampen the boost value by scorePower. */
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
new file mode 100644
index 0000000..26f6cbe
--- /dev/null
+++ b/nutch-plugins/scoring-opic/src/main/java/org/apache/nutch/scoring/opic/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter implementing a variant of the Online Page Importance Computation
+ * (OPIC) algorithm.
+ */
+package org.apache.nutch.scoring.opic;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/build-ivy.xml b/nutch-plugins/scoring-similarity/build-ivy.xml
new file mode 100644
index 0000000..50fbb96
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/build.xml b/nutch-plugins/scoring-similarity/build.xml
new file mode 100644
index 0000000..66ac8f3
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/ivy.xml b/nutch-plugins/scoring-similarity/ivy.xml
new file mode 100644
index 0000000..be0a1de
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/plugin.xml b/nutch-plugins/scoring-similarity/plugin.xml
new file mode 100644
index 0000000..9639c18
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="scoring-similarity"
+   name="Similarity based Scoring Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="scoring-similarity.jar">
+         <export name="*"/>
+      </library>
+      <library name="lucene-analyzers-common-5.5.0.jar"/>
+      <library name="lucene-core-5.5.0.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+   
+   <extension id="org.apache.nutch.scoring.similarity"
+              name="SimilarityScoring"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="scoring-similarity"
+                      class="org.apache.nutch.scoring.similarity.SimilarityScoringFilter" />
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/pom.xml b/nutch-plugins/scoring-similarity/pom.xml
new file mode 100644
index 0000000..b1f7cb7
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>scoring-similarity</artifactId>
+    <packaging>jar</packaging>
+
+    <name>scoring-similarity</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>5.5.0</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
new file mode 100644
index 0000000..f44fabd
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+
+public interface SimilarityModel {
+
+  public void setConf(Configuration conf);
+  
+  public float setURLScoreAfterParsing(Text url, Content content, Parse parse);
+  
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
new file mode 100644
index 0000000..0f905b8
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.AbstractScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity;
+
+public class SimilarityScoringFilter extends AbstractScoringFilter {
+
+  private Configuration conf;
+  private SimilarityModel similarityModel;
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    switch(conf.get("scoring.similarity.model","cosine")){
+    case "cosine":
+      similarityModel = (SimilarityModel) new CosineSimilarity();
+      break;
+    }
+    similarityModel.setConf(conf);
+  }
+
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+
+    float score = similarityModel.setURLScoreAfterParsing(url, content, parse);
+    parse.getData().getContentMeta()
+    .set(Nutch.SCORE_KEY, score+"");
+  }
+
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    similarityModel.distributeScoreToOutlinks(fromUrl, parseData, targets, adjust, allCount);
+    return adjust;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
new file mode 100644
index 0000000..9853b34
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.similarity.SimilarityModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CosineSimilarity implements SimilarityModel{
+
+  private Configuration conf; 
+  private final static Logger LOG = LoggerFactory
+      .getLogger(CosineSimilarity.class);
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  @Override
+  public float setURLScoreAfterParsing(Text url, Content content, Parse parse) {
+    float score = 1;
+
+    try {
+      if(!Model.isModelCreated){
+        Model.createModel(conf);
+      }
+      String metatags = parse.getData().getParseMeta().get("metatag.keyword");
+      String metaDescription = parse.getData().getParseMeta().get("metatag.description");
+      int[] ngramArr = Model.retrieveNgrams(conf);
+      int mingram = ngramArr[0];
+      int maxgram = ngramArr[1];
+      DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram);
+      if(docVector!=null){
+        score = Model.computeCosineSimilarity(docVector);
+        LOG.info("Setting score of {} to {}",url, score);
+      }
+      else {
+        throw new Exception("Could not create DocVector from parsed text");
+      }
+    } catch (Exception e) {
+      LOG.error("Error creating Cosine Model, setting scores of urls to 1 : {}", StringUtils.stringifyException(e));
+    }
+    return score;
+  }
+
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+      int allCount) {
+    float score = Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
+    for (Entry<Text, CrawlDatum> target : targets) {
+      target.getValue().setScore(score);
+    }
+    return adjust;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
new file mode 100644
index 0000000..33c3a23
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class DocVector {
+
+  public HashMap<Integer, Long> termVector;
+  public HashMap<String, Integer> termFreqVector;
+
+  public DocVector() {
+    termFreqVector = new HashMap<>();
+  }
+
+  public void setTermFreqVector(HashMap<String, Integer> termFreqVector) {
+    this.termFreqVector = termFreqVector;
+  }
+  
+  public void setVectorEntry(int pos, long freq) {
+    termVector.put(pos, freq);
+  }
+  
+  public float dotProduct(DocVector docVector) {
+    float product = 0;
+    for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+      if(docVector.termFreqVector.containsKey(entry.getKey())) {
+        product += docVector.termFreqVector.get(entry.getKey())*entry.getValue();
+      }
+    }
+    return product;
+  }
+  
+  public float getL2Norm() {
+    float sum = 0;
+    for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+      sum += entry.getValue()*entry.getValue();
+    }
+    return (float) Math.sqrt(sum);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
new file mode 100644
index 0000000..d8180f2
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType;
+import org.apache.tika.Tika;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class creates a model used to store Document vector representation of the corpus. 
+ *
+ */
+public class Model {
+
+  //Currently only one file, but in future could accept a corpus hence an ArrayList
+  public static ArrayList<DocVector> docVectors = new ArrayList<>(); 
+  private static final Logger LOG = LoggerFactory.getLogger(Model.class);
+  public static boolean isModelCreated = false;
+  private static List<String> stopWords;
+
+  public static synchronized void createModel(Configuration conf) throws IOException {
+    if(isModelCreated) {
+      LOG.info("Model exists, skipping model creation");
+      return;
+    }
+    LOG.info("Creating Cosine model");
+    try {
+      //If user has specified a stopword file other than the template
+      if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) {
+        stopWords = new ArrayList<String>();
+        String stopWord;
+        BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file"))));
+        while ((stopWord = br.readLine()) != null) {
+          stopWords.add(stopWord);
+        }
+        LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
+      }
+
+      int[] ngramArr = retrieveNgrams(conf);
+      int mingram = ngramArr[0];
+      int maxgram = ngramArr[1];
+      LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);
+
+      // TODO : Allow for corpus of documents to be provided as gold standard. 
+      String line;
+      StringBuilder sb = new StringBuilder();
+      BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file"))));
+      while ((line = br.readLine()) != null) {
+        sb.append(line);
+      }
+      DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
+      if(goldStandard!=null)
+        docVectors.add(goldStandard);
+      else {
+        throw new Exception("Could not create DocVector for goldstandard");
+      }
+    } catch (Exception e) {
+      LOG.warn("Failed to add {} to model : {}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"), 
+          StringUtils.stringifyException(e));
+    }
+    if(docVectors.size()>0) {
+      LOG.info("Cosine model creation complete");
+      isModelCreated = true;
+    }
+    else
+      LOG.info("Cosine model creation failed");
+  }
+
+  /**
+   * Used to create a DocVector from given String text. Used during the parse stage of the crawl 
+   * cycle to create a DocVector of the currently parsed page from the parseText attribute value
+   * @param content The text to tokenize
+   * @param mingram Value of mingram for tokenizing
+   * @param maxgram Value of maxgram for tokenizing
+   */
+  public static DocVector createDocVector(String content, int mingram, int maxgram) {
+    LuceneTokenizer tokenizer;
+
+    if(mingram > 1 && maxgram > 1){
+      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
+    } else if (mingram > 1) {
+      maxgram = mingram;
+      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
+    }
+    else if(stopWords!=null) {
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, 
+          StemFilterType.PORTERSTEM_FILTER);
+    }
+    else {
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true, 
+          StemFilterType.PORTERSTEM_FILTER);
+    }
+    TokenStream tStream = tokenizer.getTokenStream();
+    HashMap<String, Integer> termVector = new HashMap<>();
+    try {
+      CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
+      tStream.reset();
+      while(tStream.incrementToken()) {
+        String term = charTermAttribute.toString();
+        LOG.debug(term);
+        if(termVector.containsKey(term)) {
+          int count = termVector.get(term);
+          count++;
+          termVector.put(term, count);
+        }
+        else {
+          termVector.put(term, 1);
+        }
+      }
+      DocVector docVector = new DocVector();
+      docVector.setTermFreqVector(termVector);
+      return docVector;
+    } catch (IOException e) {
+      LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e));
+    }
+    return null;
+  }
+
+  public static float computeCosineSimilarity(DocVector docVector) {
+    float scores[] = new float[docVectors.size()];
+    int i=0;
+    float maxScore = 0;
+    for(DocVector corpusDoc : docVectors) {
+      float numerator = docVector.dotProduct(corpusDoc);
+      float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm();
+      float currentScore = numerator/denominator;
+      scores[i++] = currentScore;
+      maxScore = (currentScore>maxScore)? currentScore : maxScore;
+    }
+    // Returning the max score amongst all documents in the corpus
+    return maxScore;
+  }
+
+  /**
+   * Retrieves mingram and maxgram from configuration
+   * @param conf Configuration to retrieve mingram and maxgram
+   * @return ngram array as mingram at first index and maxgram at second index
+     */
+  public static int[] retrieveNgrams(Configuration conf){
+    int[] ngramArr = new int[2];
+    //Check if user has specified mingram or ngram for ngram cosine model
+    String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1");
+    //mingram
+    ngramArr[0] = Integer.parseInt(ngramStr[0]);
+    int maxgram;
+    if (ngramStr.length > 1) {
+      //maxgram
+      ngramArr[1] = Integer.parseInt(ngramStr[1]);
+    } else {
+      //maxgram
+      ngramArr[1] = ngramArr[0];
+    }
+    return ngramArr;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
new file mode 100644
index 0000000..70ae4ab
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
@@ -0,0 +1,7 @@
+/**
+ * 
+ */
+/** Implements the cosine similarity metric for scoring relevant documents 
+ *
+ */
+package org.apache.nutch.scoring.similarity.cosine;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
new file mode 100644
index 0000000..4b519bc
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+/**
+ * Creates a custom analyzer based on user provided inputs
+ *
+ */
+public class LuceneAnalyzerUtil extends Analyzer{ 
+  
+  public static enum StemFilterType { PORTERSTEM_FILTER, ENGLISHMINIMALSTEM_FILTER, NONE }
+  
+  private static StemFilterType stemFilterType;
+  private static CharArraySet stopSet;
+  
+  
+  /**
+   * Creates an analyzer instance based on Lucene default stopword set if @param useStopFilter is set to true
+   */
+  public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) {
+    LuceneAnalyzerUtil.stemFilterType = stemFilterType;
+    if(useStopFilter) {
+      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+    }
+    else {
+      stopSet = null;
+    }
+  }
+  
+  /**
+   * Creates an analyzer instance based on user provided stop words. If @param addToDefault is set to true, then 
+   * user provided stop words will be added to the Lucene default stopset.
+   */
+  public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) {
+    LuceneAnalyzerUtil.stemFilterType = stemFilterType;
+    if(addToDefault) {
+      stopSet.addAll(stopWords);
+    }
+    else {
+      stopSet = StopFilter.makeStopSet(stopWords);
+    }
+  }
+    
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName) {
+    Tokenizer source = new ClassicTokenizer();
+    TokenStream filter = new LowerCaseFilter(source);
+    if(stopSet != null) {
+      filter = new StopFilter(filter, stopSet);
+    }
+    
+    switch(stemFilterType){
+    case PORTERSTEM_FILTER:
+      filter = new PorterStemFilter(filter);
+      break;
+    case ENGLISHMINIMALSTEM_FILTER:
+      filter = new EnglishMinimalStemFilter(filter);
+      break;
+    default:
+      break;        
+    }
+    return new TokenStreamComponents(source, filter);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
new file mode 100644
index 0000000..acb987c
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.StringReader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+
+public class LuceneTokenizer {
+
+  private TokenStream tokenStream; 
+  private TokenizerType tokenizer;
+  private StemFilterType stemFilterType;
+  private CharArraySet stopSet = null;
+
+  public static enum TokenizerType {CLASSIC, STANDARD}
+
+  /**
+   * Creates a tokenizer based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
+   * @param useStopFilter - if set to true the token stream will be filtered using default Lucene stopset 
+   * @param stemFilterType - Type of stemming to perform 
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter, StemFilterType stemFilterType) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    if(useStopFilter) {
+      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+    }
+    tokenStream = createTokenStream(content);
+  }
+
+  /**
+   * Creates a tokenizer based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
+   * @param stopSet - Provide a set of user defined stop words
+   * @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set.
+   * If false, then only the user provided words will be used as the stop set
+   * @param stemFilterType
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault, StemFilterType stemFilterType) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    if(addToDefault) {
+      CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);;
+      for(String word : stopWords){
+        stopSet.add(word);
+      }
+      this.stopSet = stopSet;
+    }
+    else {
+      stopSet = new CharArraySet(stopWords, true);
+    }
+    tokenStream = createTokenStream(content);
+  }
+
+  /**
+   * Returns the tokenStream created by the Tokenizer
+   * @return
+   */
+  public TokenStream getTokenStream() {
+    return tokenStream;
+  }
+  
+  /**
+   * Creates a tokenizer for the ngram model based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
+   * @param stemFilterType - Type of stemming to perform
+   * @param mingram - Value of mingram for tokenizing
+   * @param maxgram - Value of maxgram for tokenizing
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int mingram, int maxgram) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    tokenStream = createNGramTokenStream(content, mingram, maxgram);
+  }
+  
+  private TokenStream createTokenStream(String content) {
+    tokenStream = generateTokenStreamFromText(content, tokenizer);
+    tokenStream = new LowerCaseFilter(tokenStream);
+    if(stopSet != null) {
+      tokenStream = applyStopFilter(stopSet);
+    }
+    tokenStream = applyStemmer(stemFilterType);
+    return tokenStream;
+  }
+
+  private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType){
+    Tokenizer tokenizer = null;
+    switch(tokenizerType){
+    case CLASSIC:
+      tokenizer = new ClassicTokenizer();
+      break;
+
+    case STANDARD:
+    default:
+      tokenizer = new StandardTokenizer();
+    }
+
+    tokenizer.setReader(new StringReader(content));
+
+    tokenStream = tokenizer;
+
+    return tokenStream;
+  }
+
+  private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
+    Tokenizer tokenizer = new StandardTokenizer();
+    tokenizer.setReader(new StringReader(content));
+    tokenStream = new LowerCaseFilter(tokenizer);
+    tokenStream = applyStemmer(stemFilterType);
+    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
+    shingleFilter.setOutputUnigrams(false);
+    tokenStream = (TokenStream)shingleFilter;
+    return tokenStream;
+  }
+
+  private TokenStream applyStopFilter(CharArraySet stopWords) {
+    tokenStream = new StopFilter(tokenStream, stopWords); 
+    return tokenStream;
+  }
+
+  private TokenStream applyStemmer(StemFilterType stemFilterType) {
+    switch(stemFilterType){
+    case ENGLISHMINIMALSTEM_FILTER:
+      tokenStream = new EnglishMinimalStemFilter(tokenStream);
+      break;
+    case PORTERSTEM_FILTER:
+      tokenStream = new PorterStemFilter(tokenStream);
+      break;
+    default:
+      break;
+    }
+
+    return tokenStream; 
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
new file mode 100644
index 0000000..f660977
--- /dev/null
+++ b/nutch-plugins/scoring-similarity/src/main/java/org/apache/nutch/scoring/similarity/util/package-info.java
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * 
+ */
+/**
+ * Utility package for Lucene functions
+ *
+ */
+package org.apache.nutch.scoring.similarity.util;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/README.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/README.txt b/nutch-plugins/subcollection/README.txt
new file mode 100644
index 0000000..6b79d16
--- /dev/null
+++ b/nutch-plugins/subcollection/README.txt
@@ -0,0 +1,10 @@
+For brief description about this plugin see
+src/java/org/apache/nutch/collection/package.html
+
+Basically:
+You need to enable this during indexing and during searching
+
+After indexing you can limit your searches to a certain
+subcollection with the keyword "subcollection", e.g.
+
+"subcollection:nutch hadoop"

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/build.xml b/nutch-plugins/subcollection/build.xml
new file mode 100644
index 0000000..77beac6
--- /dev/null
+++ b/nutch-plugins/subcollection/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="subcollection" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/ivy.xml b/nutch-plugins/subcollection/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/subcollection/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/plugin.xml b/nutch-plugins/subcollection/plugin.xml
new file mode 100644
index 0000000..ca2cf2f
--- /dev/null
+++ b/nutch-plugins/subcollection/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="subcollection"
+   name="Subcollection indexing and query filter"
+   version="1.0.0"
+   provider-name="apache.org">
+
+   <runtime>
+      <library name="subcollection.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.subcollection.indexing"
+              name="Subcollection Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="SubcollectionIndexingFilter"
+                      class="org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter"/>
+                      
+   </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/pom.xml b/nutch-plugins/subcollection/pom.xml
new file mode 100644
index 0000000..d8e3a97
--- /dev/null
+++ b/nutch-plugins/subcollection/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>subcollection</artifactId>
+    <packaging>jar</packaging>
+
+    <name>subcollection</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
new file mode 100644
index 0000000..0dff3f8
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/CollectionManager.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.util.DomUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.ObjectCache;
+import org.apache.xerces.dom.DocumentImpl;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+public class CollectionManager extends Configured {
+
+  public static final String DEFAULT_FILE_NAME = "subcollections.xml";
+
+  static final Logger LOG = LoggerFactory.getLogger(CollectionManager.class);
+
+  transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>();
+
+  transient URL configfile;
+
+  public CollectionManager(Configuration conf) {
+    super(conf);
+    init();
+  }
+
+  /**
+   * Used for testing
+   */
+  protected CollectionManager() {
+    super(NutchConfiguration.create());
+  }
+
+  protected void init() {
+    try {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("initializing CollectionManager");
+      }
+      // initialize known subcollections
+      configfile = getConf().getResource(
+          getConf().get("subcollections.config", DEFAULT_FILE_NAME));
+
+      InputStream input = getConf().getConfResourceAsInputStream(
+          getConf().get("subcollections.config", DEFAULT_FILE_NAME));
+      parse(input);
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Error occured:" + e);
+      }
+    }
+  }
+
+  protected void parse(InputStream input) {
+    Element collections = DomUtil.getDom(input);
+
+    if (collections != null) {
+      NodeList nodeList = collections
+          .getElementsByTagName(Subcollection.TAG_COLLECTION);
+
+      if (LOG.isInfoEnabled()) {
+        LOG.info("file has " + nodeList.getLength() + " elements");
+      }
+
+      for (int i = 0; i < nodeList.getLength(); i++) {
+        Element scElem = (Element) nodeList.item(i);
+        Subcollection subCol = new Subcollection(getConf());
+        subCol.initialize(scElem);
+        collectionMap.put(subCol.name, subCol);
+      }
+    } else if (LOG.isInfoEnabled()) {
+      LOG.info("Cannot find collections");
+    }
+  }
+
+  public static CollectionManager getCollectionManager(Configuration conf) {
+    String key = "collectionmanager";
+    ObjectCache objectCache = ObjectCache.get(conf);
+    CollectionManager impl = (CollectionManager) objectCache.getObject(key);
+    if (impl == null) {
+      try {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Instantiating CollectionManager");
+        }
+        impl = new CollectionManager(conf);
+        objectCache.setObject(key, impl);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create CollectionManager", e);
+      }
+    }
+    return impl;
+  }
+
+  /**
+   * Returns named subcollection
+   * 
+   * @param id
+   * @return Named SubCollection (or null if not existing)
+   */
+  public Subcollection getSubColection(final String id) {
+    return (Subcollection) collectionMap.get(id);
+  }
+
+  /**
+   * Delete named subcollection
+   * 
+   * @param id
+   *          Id of SubCollection to delete
+   */
+  public void deleteSubCollection(final String id) throws IOException {
+    final Subcollection subCol = getSubColection(id);
+    if (subCol != null) {
+      collectionMap.remove(id);
+    }
+  }
+
+  /**
+   * Create a new subcollection.
+   * 
+   * @param name
+   *          Name of SubCollection to create
+   * @return Created SubCollection or null if allready existed
+   */
+  public Subcollection createSubCollection(final String id, final String name) {
+    Subcollection subCol = null;
+
+    if (!collectionMap.containsKey(id)) {
+      subCol = new Subcollection(id, name, getConf());
+      collectionMap.put(id, subCol);
+    }
+
+    return subCol;
+  }
+
+  /**
+   * Return names of collections url is part of
+   * 
+   * @param url
+   *          The url to test against Collections
+   * @return Subcollections
+   */
+  public List<Subcollection> getSubCollections(final String url) {
+    List<Subcollection> collections = new ArrayList<Subcollection>();
+    final Iterator iterator = collectionMap.values().iterator();
+
+    while (iterator.hasNext()) {
+      final Subcollection subCol = (Subcollection) iterator.next();
+      if (subCol.filter(url) != null) {
+        collections.add(subCol);
+      }
+    }
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("subcollections:" + Arrays.toString(collections.toArray()));
+    }
+
+    return collections;
+  }
+
+  /**
+   * Returns all collections
+   * 
+   * @return All collections CollectionManager knows about
+   */
+  public Collection getAll() {
+    return collectionMap.values();
+  }
+
+  /**
+   * Save collections into file
+   * 
+   * @throws Exception
+   */
+  public void save() throws IOException {
+    try {
+      final FileOutputStream fos = new FileOutputStream(new File(
+          configfile.getFile()));
+      final Document doc = new DocumentImpl();
+      final Element collections = doc
+          .createElement(Subcollection.TAG_COLLECTIONS);
+      final Iterator iterator = collectionMap.values().iterator();
+
+      while (iterator.hasNext()) {
+        final Subcollection subCol = (Subcollection) iterator.next();
+        final Element collection = doc
+            .createElement(Subcollection.TAG_COLLECTION);
+        collections.appendChild(collection);
+        final Element name = doc.createElement(Subcollection.TAG_NAME);
+        name.setNodeValue(subCol.getName());
+        collection.appendChild(name);
+        final Element whiteList = doc
+            .createElement(Subcollection.TAG_WHITELIST);
+        whiteList.setNodeValue(subCol.getWhiteListString());
+        collection.appendChild(whiteList);
+        final Element blackList = doc
+            .createElement(Subcollection.TAG_BLACKLIST);
+        blackList.setNodeValue(subCol.getBlackListString());
+        collection.appendChild(blackList);
+      }
+
+      DomUtil.saveDom(fos, collections);
+      fos.flush();
+      fos.close();
+    } catch (FileNotFoundException e) {
+      throw new IOException(e.toString());
+    }
+  }
+}


[22/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java b/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
new file mode 100644
index 0000000..1a81041
--- /dev/null
+++ b/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.staticfield;
+
+import java.util.HashMap;
+import java.util.Map.Entry;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * A simple plugin called at indexing that adds fields with static data. You can
+ * specify a list of fieldname:fieldcontent per nutch job. It can be useful when
+ * collections can't be created by urlpatterns, like in subcollection, but on a
+ * job-basis.
+ */
+
+public class StaticFieldIndexer implements IndexingFilter {
+  private Configuration conf;
+  private HashMap<String, String[]> fields;
+  private boolean addStaticFields = false;
+  private String fieldSep = ",";
+  private String kevSep = ":";
+  private String valueSep = " ";
+
+  /**
+   * The {@link StaticFieldIndexer} filter object which adds fields as per
+   * configuration setting. See {@code index.static} in nutch-default.xml.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} containing anchor text
+   * @return filtered NutchDocument
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    if (this.addStaticFields == true) {
+      for (Entry<String, String[]> entry : this.fields.entrySet()) {
+        for (String val : entry.getValue()) {
+          doc.add(entry.getKey(), val);
+        }
+      }
+    }
+    return doc;
+  }
+
+  /**
+   * Populate a HashMap from a list of fieldname:fieldcontent. See
+   * {@index.static} in nutch-default.xml.
+   * 
+   * @param fieldsString
+   *          string containing field:value pairs
+   * @return HashMap of fields and their corresponding values
+   */
+  private HashMap<String, String[]> parseFields(String fieldsString) {
+    HashMap<String, String[]> fields = new HashMap<String, String[]>();
+
+    /*
+     * The format is very easy, it's a comma-separated list of fields in the
+     * form <name>:<value>
+     */
+    for (String field : fieldsString.split(this.fieldSep)) {
+      String[] entry = field.split(this.kevSep);
+      if (entry.length == 2)
+        fields.put(entry[0].trim(), entry[1].trim().split(this.valueSep));
+    }
+
+    return fields;
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // NUTCH-2052: Allow user-defined delimiters in index.static
+    this.fieldSep = this.regexEscape(conf.get("index.static.fieldsep", ","));
+    this.kevSep = this.regexEscape(conf.get("index.static.keysep", ":"));
+    this.valueSep = this.regexEscape(conf.get("index.static.valuesep", " "));
+
+    String fieldsString = conf.get("index.static", null);
+    if (fieldsString != null) {
+      this.addStaticFields = true;
+      this.fields = parseFields(fieldsString);
+    }
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Escapes any character that needs escaping so it can be used in a regexp.
+   */
+  protected String regexEscape(String in) {
+    String result = in;
+    if (in != null) {
+      StringBuffer sb = new StringBuffer();
+      for (int i = 0; i < in.length(); i++) {
+        CharSequence c = in.subSequence(i, i+1);
+        if ("<([{\\^-=$!|]})?*+.>".contains(c)) {
+          sb.append('\\');
+        }
+        sb.append(c);
+      }
+      result = sb.toString();
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/package.html b/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/package.html
new file mode 100644
index 0000000..f4b5146
--- /dev/null
+++ b/nutch-plugins/index-static/src/main/java/org/apache/nutch/indexer/staticfield/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A simple plugin called at indexing that adds fields with static data. You can specify a list of fieldname:fieldcontent per nutch job. It can be useful when collections can't be created by urlpatterns, like in subcollection, but on a job-basis.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/index-static/src/test/java/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-static/src/test/java/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java b/nutch-plugins/index-static/src/test/java/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
new file mode 100644
index 0000000..42cd46d
--- /dev/null
+++ b/nutch-plugins/index-static/src/test/java/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.staticfield;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests 1. that static data fields are added to a
+ * document 2. that empty {@code index.static} does not add anything to the
+ * document 3. that valid field:value pairs are added to the document 4. that
+ * fields and values added to the document are trimmed
+ * 
+ * @author tejasp
+ */
+
+public class TestStaticFieldIndexerTest {
+
+  Configuration conf;
+
+  Inlinks inlinks;
+  ParseImpl parse;
+  CrawlDatum crawlDatum;
+  Text url;
+  StaticFieldIndexer filter;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    parse = new ParseImpl();
+    url = new Text("http://nutch.apache.org/index.html");
+    crawlDatum = new CrawlDatum();
+    inlinks = new Inlinks();
+    filter = new StaticFieldIndexer();
+  }
+
+  /**
+   * Test that empty {@code index.static} does not add anything to the document
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testEmptyIndexStatic() throws Exception {
+
+    Assert.assertNotNull(filter);
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+
+    try {
+      filter.filter(doc, parse, url, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+
+    Assert.assertNotNull(doc);
+    Assert.assertTrue("tests if no field is set for empty index.static", doc
+        .getFieldNames().isEmpty());
+  }
+
+  /**
+   * Test that valid field:value pairs are added to the document
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testNormalScenario() throws Exception {
+
+    conf.set("index.static",
+        "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ");
+    Assert.assertNotNull(filter);
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+
+    try {
+      filter.filter(doc, parse, url, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+
+    Assert.assertNotNull(doc);
+    Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+        .isEmpty());
+    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames()
+        .size());
+    Assert.assertTrue("test if doc has field1", doc.getField("field1")
+        .getValues().contains("val1"));
+    Assert.assertTrue("test if doc has field2", doc.getField("field2")
+        .getValues().contains("val2"));
+    Assert.assertTrue("test if doc has field4", doc.getField("field4")
+        .getValues().contains("val4"));
+  }
+
+  /**
+   * Test for NUTCH-2052 custom delimiters in index.static.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testCustomDelimiters() throws Exception {
+
+    conf.set("index.static.fieldsep", ">");
+    conf.set("index.static.keysep", "=");
+    conf.set("index.static.valuesep", "|");
+    conf.set("index.static",
+        "field1=val1>field2    =      val2|val3     >field3>field4 =val4 > ");
+    Assert.assertNotNull(filter);
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+
+    try {
+      filter.filter(doc, parse, url, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+
+    Assert.assertNotNull(doc);
+    Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+        .isEmpty());
+    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames()
+        .size());
+    Assert.assertTrue("test if doc has field1", doc.getField("field1")
+        .getValues().contains("val1"));
+    Assert.assertTrue("test if doc has field2", doc.getField("field2")
+        .getValues().contains("val2"));
+    Assert.assertTrue("test if doc has field4", doc.getField("field4")
+        .getValues().contains("val4"));
+  }
+
+  /**
+   * Test for NUTCH-2052 custom delimiters in index.static.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testCustomMulticharacterDelimiters() throws Exception {
+
+    conf.set("index.static.fieldsep", "\n\n");
+    conf.set("index.static.keysep", "\t\t");
+    conf.set("index.static.valuesep", "***");
+    conf.set("index.static", "field1\t\tval1\n\n" + "field2\t\tval2***val3\n\n"
+        + "field3\n\n" + "field4\t\tval4\n\n\n\n");
+    Assert.assertNotNull(filter);
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+
+    try {
+      filter.filter(doc, parse, url, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+
+    Assert.assertNotNull(doc);
+    Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+        .isEmpty());
+    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames()
+        .size());
+    Assert.assertTrue("test if doc has field1", doc.getField("field1")
+        .getValues().contains("val1"));
+    Assert.assertTrue("test if doc has field2", doc.getField("field2")
+        .getValues().contains("val2"));
+    Assert.assertTrue("test if doc has field4", doc.getField("field4")
+        .getValues().contains("val4"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/README.md
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/README.md b/nutch-plugins/indexer-cloudsearch/README.md
new file mode 100644
index 0000000..8669682
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/README.md
@@ -0,0 +1,58 @@
+AWS CloudSearch plugin for Nutch 
+================================
+
+See [http://aws.amazon.com/cloudsearch/] for information on AWS CloudSearch.
+
+Steps to use :
+
+From runtime/local/bin
+
+* Configure the AWS credentials 
+
+Edit `~/.aws/credentials`, see [http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html] for details. Note that this should not be necessary when running Nutch on EC2.
+
+* Edit ../conf/nutch-site.xml and check that 'plugin.includes' contains 'indexer-cloudsearch'. 
+
+* (Optional) Test the indexing 
+
+`./nutch indexchecker -D doIndex=true -D cloudsearch.batch.dump=true "http://nutch.apache.org/"`
+
+if the agent name hasn't been configured in nutch-site.xml, it can be added on the command line with `-D http.agent.name=whateverValueDescribesYouBest`
+
+you should see the fields extracted for the indexing coming up on the console.
+
+Using the `cloudsearch.batch.dump` parameter allows you to dump the batch to the local temp dir. The files have the prefix "CloudSearch_", e.g. `/tmp/CloudSearch_4822180575734804454.json`. Such a temp file can be used as a template when defining the fields in the domain creation (see below).
+
+* Create a CloudSearch domain
+
+This can be done using the web console [https://eu-west-1.console.aws.amazon.com/cloudsearch/home?region=eu-west-1#]. You can use the temp file generated above to bootstrap the field definition. 
+
+You can also create the domain using the AWS CLI [http://docs.aws.amazon.com/cloudsearch/latest/developerguide/creating-domains.html] and the `createCSDomain.sh` example script provided. This script is merely a starting point, which you should further improve and fine-tune. 
+
+Note that the creation of the domain can take some time. Once it is complete, note the document endpoint, or alternatively verify the region and domain name.
+
+* Edit ../conf/nutch-site.xml and add `cloudsearch.endpoint` and `cloudsearch.region`. 
+
+* Re-test the indexing
+
+`./nutch indexchecker -D doIndex=true "http://nutch.apache.org/"`
+
+and check in the CloudSearch console that the document has been successfully indexed.
+
+Additional parameters
+
+* `cloudsearch.batch.maxSize` \: can be used to limit the size of the batches sent to CloudSearch to N documents. Note that the default limitations still apply.
+
+* `cloudsearch.batch.dump` \: see above. Stores the JSON representation of the document batch in the local temp dir, useful for bootstrapping the index definition.
+
+Note
+
+The CloudSearchIndexWriter will log any errors while sending the batches to CloudSearch and will resume the process without breaking. This means that you might not get all the documents in the index. You should check the log files for errors. Using small batch sizes will limit the number of documents skipped in case of error.
+
+Any fields not defined in the CloudSearch domain will be ignored by the CloudSearchIndexWriter. Again, the logs will contain a trace of any field names skipped.
+
+
+
+  
+
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/build.xml b/nutch-plugins/indexer-cloudsearch/build.xml
new file mode 100644
index 0000000..852b2650bd
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-cloudsearch" default="jar-core">
+
+  <import file="../build-plugin.xml" />
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/createCSDomain.sh
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/createCSDomain.sh b/nutch-plugins/indexer-cloudsearch/createCSDomain.sh
new file mode 100644
index 0000000..24fb015
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/createCSDomain.sh
@@ -0,0 +1,22 @@
#!/bin/sh
# Example of domain configuration for CloudSearch.
# Creates the domain given as first argument and defines the index fields
# used by the Nutch indexer-cloudsearch plugin.

DOMAIN="$1"

# Abort with a usage hint when no domain name was supplied.
if [ -z "$DOMAIN" ]; then
    echo "Need to specify a domain name as argument"
    # exit codes must be 0-255; 'exit -1' is non-portable (wraps to 255)
    exit 1
fi

# Quote "$DOMAIN" everywhere so names containing shell metacharacters
# or whitespace are passed through intact.
aws cloudsearch create-domain --domain-name "$DOMAIN"

aws cloudsearch define-index-field --domain-name "$DOMAIN" --name boost --type double --sort-enabled true --facet-enabled false
aws cloudsearch define-index-field --domain-name "$DOMAIN" --name content --type text --sort-enabled false
aws cloudsearch define-index-field --domain-name "$DOMAIN" --name digest --type literal --sort-enabled false --facet-enabled false
aws cloudsearch define-index-field --domain-name "$DOMAIN" --name host --type literal --sort-enabled false --facet-enabled true
aws cloudsearch define-index-field --domain-name "$DOMAIN" --name id --type literal --sort-enabled false --facet-enabled false
aws cloudsearch define-index-field --domain-name "$DOMAIN" --name segment --type literal --sort-enabled true --facet-enabled false
aws cloudsearch define-index-field --domain-name "$DOMAIN" --name title --type text --sort-enabled false
aws cloudsearch define-index-field --domain-name "$DOMAIN" --name tstamp --type date --sort-enabled true --facet-enabled false
aws cloudsearch define-index-field --domain-name "$DOMAIN" --name url --type literal --sort-enabled false --facet-enabled false
+
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/ivy.xml b/nutch-plugins/indexer-cloudsearch/ivy.xml
new file mode 100644
index 0000000..00d9fc3
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+	<dependency org="com.amazonaws" name="aws-java-sdk-cloudsearch" rev="1.10.0"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/plugin.xml b/nutch-plugins/indexer-cloudsearch/plugin.xml
new file mode 100644
index 0000000..5b44253
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/plugin.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<plugin id="indexer-cloudsearch" name="CloudSearchIndexWriter" version="1.0.0"
+  provider-name="nutch.apache.org">
+
+  <runtime>
+    <library name="indexer-cloudsearch.jar">
+      <export name="*" />
+    </library>
+
+     <library name="aws-java-sdk-cloudsearch-1.10.0.jar"/>
+     <library name="aws-java-sdk-core-1.10.0.jar"/>
+     <library name="commons-codec-1.6.jar"/>
+     <library name="commons-logging-1.1.3.jar"/>
+     <library name="httpclient-4.3.6.jar"/>
+     <library name="httpcore-4.3.3.jar"/>
+     <library name="jackson-annotations-2.5.0.jar"/>
+     <library name="jackson-core-2.5.3.jar"/>
+     <library name="jackson-databind-2.5.3.jar"/>
+     <library name="joda-time-2.8.jar"/>
+
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints" />
+  </requires>
+
+  <extension id="org.apache.nutch.indexer.cloudsearch"
+    name="CloudSearch Index Writer"
+    point="org.apache.nutch.indexer.IndexWriter">
+    <implementation id="CloudSearchIndexWriter"
+      class="org.apache.nutch.indexwriter.cloudsearch.CloudSearchIndexWriter" />
+  </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/pom.xml b/nutch-plugins/indexer-cloudsearch/pom.xml
new file mode 100644
index 0000000..af37fc4
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>indexer-cloudsearch</artifactId>
+    <packaging>jar</packaging>
+
+    <name>indexer-cloudsearch</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.amazonaws</groupId>
+            <artifactId>aws-java-sdk-cloudsearch</artifactId>
+            <version>1.10.0</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchConstants.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchConstants.java b/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchConstants.java
new file mode 100644
index 0000000..8bfb161
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchConstants.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexwriter.cloudsearch;
+
+/**
+ * Configuration property names used by the CloudSearch index writer. All
+ * keys share the "cloudsearch." prefix and are read from the Hadoop/Nutch
+ * job configuration.
+ */
+public interface CloudSearchConstants {
+  public static final String CLOUDSEARCH_PREFIX = "cloudsearch.";
+  // URL of the CloudSearch domain's document endpoint (mandatory unless
+  // batch dumping is enabled).
+  public static final String ENDPOINT = CLOUDSEARCH_PREFIX + "endpoint";
+  // Name of the AWS region hosting the CloudSearch domain (optional).
+  public static final String REGION = CLOUDSEARCH_PREFIX + "region";
+  // When true, batches are written to local temp files instead of being
+  // uploaded (debugging aid).
+  public static final String BATCH_DUMP = CLOUDSEARCH_PREFIX + "batch.dump";
+  // Maximum number of documents per upload batch; <= 0 means no
+  // count-based limit (only the byte-size limit applies).
+  public static final String MAX_DOCS_BATCH = CLOUDSEARCH_PREFIX
+      + "batch.maxSize";
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java b/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java
new file mode 100644
index 0000000..b6f1a9c
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java
@@ -0,0 +1,382 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexwriter.cloudsearch;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.amazonaws.regions.RegionUtils;
+import com.amazonaws.services.cloudsearchdomain.AmazonCloudSearchDomainClient;
+import com.amazonaws.services.cloudsearchdomain.model.ContentType;
+import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsRequest;
+import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsResult;
+import com.amazonaws.services.cloudsearchv2.AmazonCloudSearchClient;
+import com.amazonaws.services.cloudsearchv2.model.DescribeDomainsRequest;
+import com.amazonaws.services.cloudsearchv2.model.DescribeDomainsResult;
+import com.amazonaws.services.cloudsearchv2.model.DescribeIndexFieldsRequest;
+import com.amazonaws.services.cloudsearchv2.model.DescribeIndexFieldsResult;
+import com.amazonaws.services.cloudsearchv2.model.DomainStatus;
+import com.amazonaws.services.cloudsearchv2.model.IndexFieldStatus;
+import com.amazonaws.util.json.JSONException;
+import com.amazonaws.util.json.JSONObject;
+
+/**
+ * Writes documents to Amazon CloudSearch. Operations ("add"/"delete") are
+ * accumulated as JSON objects in an in-memory batch and uploaded to the
+ * domain's document endpoint when the batch would exceed the size limits,
+ * when the configured maximum document count is reached, or on commit().
+ * When {@link CloudSearchConstants#BATCH_DUMP} is enabled, batches are
+ * written to local temp files instead of being uploaded (debugging aid).
+ */
+public class CloudSearchIndexWriter implements IndexWriter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(CloudSearchIndexWriter.class);
+
+  // CloudSearch service limits: 5 MB per upload batch, 1 MB per document.
+  private static final int MAX_SIZE_BATCH_BYTES = 5242880;
+  private static final int MAX_SIZE_DOC_BYTES = 1048576;
+
+  // NOTE(review): SimpleDateFormat is not thread-safe, and no time zone is
+  // set even though the pattern emits a literal 'Z' suffix — confirm this
+  // writer is used single-threaded and that dates are already UTC.
+  private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat(
+      "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
+
+  // Client bound to the domain's document endpoint; stays null when only
+  // dumping batches to local files.
+  private AmazonCloudSearchDomainClient client;
+
+  // Maximum number of docs per batch; <= 0 (default -1) disables the
+  // count-based flush, leaving only the byte-size limit.
+  private int maxDocsInBatch = -1;
+
+  // JSON batch under construction; opened with '[' here and on each reset,
+  // closed with ']' in commit().
+  private StringBuffer buffer;
+
+  // Number of documents currently held in buffer.
+  private int numDocsInBatch = 0;
+
+  // When true, write batches to temp files instead of uploading them.
+  private boolean dumpBatchFilesToTemp = false;
+
+  private Configuration conf;
+
+  // Index field name -> field type as declared in the CloudSearch domain;
+  // fields absent from this map are skipped at write time.
+  private Map<String, String> csfields = new HashMap<String, String>();
+
+  // Optional AWS region name, taken from the configuration in setConf().
+  private String regionName;
+
+  /**
+   * Initializes the writer: reads the batch settings and, unless batch
+   * dumping is enabled, resolves the configured endpoint to a CloudSearch
+   * domain, loads the domain's index field schema into {@link #csfields}
+   * and opens the document-service client.
+   *
+   * @throws RuntimeException if the endpoint is missing or does not match
+   *           any domain visible to the account
+   */
+  @Override
+  public void open(JobConf job, String name) throws IOException {
+    LOG.debug("CloudSearchIndexWriter.open() name={} ", name);
+
+    maxDocsInBatch = job.getInt(CloudSearchConstants.MAX_DOCS_BATCH, -1);
+
+    buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');
+
+    dumpBatchFilesToTemp = job.getBoolean(CloudSearchConstants.BATCH_DUMP,
+        false);
+
+    if (dumpBatchFilesToTemp) {
+      // only dumping to local file
+      // no more config required
+      return;
+    }
+
+    String endpoint = job.get(CloudSearchConstants.ENDPOINT);
+
+    if (StringUtils.isBlank(endpoint)) {
+      throw new RuntimeException("endpoint not set for CloudSearch");
+    }
+
+    // configuration-level client, used only to discover the domain and its
+    // index schema (regionName was populated earlier by setConf())
+    AmazonCloudSearchClient cl = new AmazonCloudSearchClient();
+    if (StringUtils.isNotBlank(regionName)) {
+      cl.setRegion(RegionUtils.getRegion(regionName));
+    }
+
+    String domainName = null;
+
+    // retrieve the domain name by matching the doc-service endpoint
+    DescribeDomainsResult domains = cl
+        .describeDomains(new DescribeDomainsRequest());
+
+    Iterator<DomainStatus> dsiter = domains.getDomainStatusList().iterator();
+    while (dsiter.hasNext()) {
+      DomainStatus ds = dsiter.next();
+      if (ds.getDocService().getEndpoint().equals(endpoint)) {
+        domainName = ds.getDomainName();
+        break;
+      }
+    }
+
+    // check domain name
+    if (StringUtils.isBlank(domainName)) {
+      throw new RuntimeException(
+          "No domain name found for CloudSearch endpoint");
+    }
+
+    // cache the domain's index schema so unknown fields can be skipped
+    DescribeIndexFieldsResult indexDescription = cl.describeIndexFields(
+        new DescribeIndexFieldsRequest().withDomainName(domainName));
+    for (IndexFieldStatus ifs : indexDescription.getIndexFields()) {
+      String indexname = ifs.getOptions().getIndexFieldName();
+      String indextype = ifs.getOptions().getIndexFieldType();
+      LOG.info("CloudSearch index name {} of type {}", indexname, indextype);
+      csfields.put(indexname, indextype);
+    }
+
+    client = new AmazonCloudSearchDomainClient();
+    client.setEndpoint(endpoint);
+
+  }
+
+  /**
+   * Queues a "delete" operation for the given URL; the document ID is
+   * derived from the URL via {@link CloudSearchUtils#getID(String)}.
+   */
+  @Override
+  public void delete(String url) throws IOException {
+
+    try {
+      JSONObject doc_builder = new JSONObject();
+
+      doc_builder.put("type", "delete");
+
+      // generate the id from the url
+      String ID = CloudSearchUtils.getID(url);
+      doc_builder.put("id", ID);
+
+      // add to the batch
+      addToBatch(doc_builder.toString(2), url);
+
+    } catch (JSONException e) {
+      // NOTE(review): the delete is silently dropped on JSON errors
+      LOG.error("Exception caught while building JSON object", e);
+    }
+
+  }
+
+  /** CloudSearch has no separate update operation; delegates to write(). */
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    write(doc);
+  }
+
+  /**
+   * Queues an "add" operation for the document. Field names are sanitized
+   * via {@link #cleanFieldName(String)}; unless batch dumping is enabled,
+   * fields not present in the domain schema are skipped. Date values are
+   * formatted with {@link #DATE_FORMAT} and string values are stripped of
+   * characters CloudSearch rejects.
+   */
+  @Override
+  public void write(NutchDocument doc) throws IOException {
+    try {
+      JSONObject doc_builder = new JSONObject();
+
+      doc_builder.put("type", "add");
+
+      // NOTE(review): relies on NutchField.toString() yielding the plain
+      // URL string — confirm; other writers use doc.getFieldValue("url")
+      String url = doc.getField("url").toString();
+
+      // generate the id from the url
+      String ID = CloudSearchUtils.getID(url);
+      doc_builder.put("id", ID);
+
+      JSONObject fields = new JSONObject();
+
+      for (final Entry<String, NutchField> e : doc) {
+        String fieldname = cleanFieldName(e.getKey());
+        String type = csfields.get(fieldname);
+
+        // undefined in index
+        if (!dumpBatchFilesToTemp && type == null) {
+          LOG.info(
+              "Field {} not defined in CloudSearch domain for {} - skipping.",
+              fieldname, url);
+          continue;
+        }
+
+        List<Object> values = e.getValue().getValues();
+        // write the values
+        for (Object value : values) {
+          // Convert dates to a formatted string
+          if (value instanceof Date) {
+            Date d = (Date) value;
+            value = DATE_FORMAT.format(d);
+          }
+          // normalise strings
+          else if (value instanceof String) {
+            value = CloudSearchUtils.stripNonCharCodepoints((String) value);
+          }
+
+          fields.accumulate(fieldname, value);
+        }
+      }
+
+      doc_builder.put("fields", fields);
+
+      addToBatch(doc_builder.toString(2), url);
+
+    } catch (JSONException e) {
+      // NOTE(review): the document is silently dropped on JSON errors
+      LOG.error("Exception caught while building JSON object", e);
+    }
+  }
+
+  /**
+   * Appends a serialized operation to the current batch, flushing first if
+   * appending would overflow the byte limit, and flushing afterwards once
+   * the configured max document count is reached. Documents larger than
+   * the per-document limit are logged and dropped.
+   */
+  private void addToBatch(String currentDoc, String url) throws IOException {
+    int currentDocLength = currentDoc.getBytes(StandardCharsets.UTF_8).length;
+
+    // check that the doc is not too large -> skip it if so
+    if (currentDocLength > MAX_SIZE_DOC_BYTES) {
+      LOG.error("Doc too large. currentDoc.length {} : {}", currentDocLength,
+          url);
+      return;
+    }
+
+    int currentBufferLength = buffer.toString()
+        .getBytes(StandardCharsets.UTF_8).length;
+
+    LOG.debug("currentDoc.length {}, buffer length {}", currentDocLength,
+        currentBufferLength);
+
+    // can add it to the buffer without overflowing? (the +2 leaves room
+    // for the separating comma and the closing ']')
+    if (currentDocLength + 2 + currentBufferLength < MAX_SIZE_BATCH_BYTES) {
+      if (numDocsInBatch != 0)
+        buffer.append(',');
+      buffer.append(currentDoc);
+      numDocsInBatch++;
+    }
+    // flush the previous batch and create a new one with this doc
+    else {
+      commit();
+      buffer.append(currentDoc);
+      numDocsInBatch++;
+    }
+
+    // have we reached the max number of docs in a batch after adding
+    // this doc?
+    if (maxDocsInBatch > 0 && numDocsInBatch == maxDocsInBatch) {
+      commit();
+    }
+  }
+
+  /**
+   * Closes and sends the current batch (or writes it to a temp file in
+   * dump mode), then resets the buffer and document counter. A no-op when
+   * the batch is empty. Upload failures are logged, not rethrown.
+   */
+  @Override
+  public void commit() throws IOException {
+
+    // nothing to do
+    if (numDocsInBatch == 0) {
+      return;
+    }
+
+    // close the array
+    buffer.append(']');
+
+    LOG.info("Sending {} docs to CloudSearch", numDocsInBatch);
+
+    byte[] bb = buffer.toString().getBytes(StandardCharsets.UTF_8);
+
+    if (dumpBatchFilesToTemp) {
+      try {
+        File temp = File.createTempFile("CloudSearch_", ".json");
+        FileUtils.writeByteArrayToFile(temp, bb);
+        LOG.info("Wrote batch file {}", temp.getName());
+      } catch (IOException e1) {
+        LOG.error("Exception while generating batch file", e1);
+      } finally {
+        // reset buffer and doc counter
+        buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');
+        numDocsInBatch = 0;
+      }
+      return;
+    }
+    // not in debug mode
+    try (InputStream inputStream = new ByteArrayInputStream(bb)) {
+      UploadDocumentsRequest batch = new UploadDocumentsRequest();
+      batch.setContentLength((long) bb.length);
+      batch.setContentType(ContentType.Applicationjson);
+      batch.setDocuments(inputStream);
+      // response is not inspected; failures surface as exceptions
+      UploadDocumentsResult result = client.uploadDocuments(batch);
+    } catch (Exception e) {
+      LOG.error("Exception while sending batch", e);
+      LOG.error(buffer.toString());
+    } finally {
+      // reset buffer and doc counter
+      buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');
+      numDocsInBatch = 0;
+    }
+  }
+
+  /** Flushes any unsent documents and shuts down the client. */
+  @Override
+  public void close() throws IOException {
+    // This will flush any unsent documents.
+    commit();
+    // close the client (null when only dumping batches to file)
+    if (client != null){
+      client.shutdown();
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Stores the configuration, reads the region name and validates that an
+   * endpoint is configured unless batch dumping is enabled.
+   *
+   * @throws RuntimeException if the endpoint is missing and dumping is off
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String endpoint = getConf().get(CloudSearchConstants.ENDPOINT);
+    // NOTE(review): this local shadows the instance field of the same
+    // name; the field itself is only assigned later, in open()
+    boolean dumpBatchFilesToTemp = getConf()
+        .getBoolean(CloudSearchConstants.BATCH_DUMP, false);
+    this.regionName = getConf().get(CloudSearchConstants.REGION);
+
+    if (StringUtils.isBlank(endpoint) && !dumpBatchFilesToTemp) {
+      String message = "Missing CloudSearch endpoint. Should set it set via -D "
+          + CloudSearchConstants.ENDPOINT + " or in nutch-site.xml";
+      message += "\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+  }
+
+  /**
+   * Returns a human-readable description of the configuration keys
+   * understood by this writer, including their current values if set.
+   */
+  public String describe() {
+    String configuredEndpoint = null;
+    String configuredRegion = null;
+
+    // get the values set in the conf
+    if (getConf() != null) {
+      configuredEndpoint = getConf().get(CloudSearchConstants.ENDPOINT);
+      configuredRegion = getConf().get(CloudSearchConstants.REGION);
+    }
+
+    StringBuffer sb = new StringBuffer("CloudSearchIndexWriter\n");
+    sb.append("\t").append(CloudSearchConstants.ENDPOINT)
+        .append(" : URL of the CloudSearch domain's document endpoint.");
+    if (StringUtils.isNotBlank(configuredEndpoint)) {
+      sb.append(" (value: ").append(configuredEndpoint).append(")");
+    }
+    sb.append("\n");
+
+    sb.append("\t").append(CloudSearchConstants.REGION)
+        .append(" : name of the CloudSearch region.");
+    if (StringUtils.isNotBlank(configuredRegion)) {
+      sb.append(" (").append(configuredRegion).append(")");
+    }
+    sb.append("\n");
+    return sb.toString();
+  }
+
+  /**
+   * Remove the non-cloudSearch-legal characters. Note that this might convert
+   * two fields to the same name.
+   *
+   * @param name raw Nutch field name
+   * @return lower-cased name with every character outside [a-z_0-9]
+   *         replaced by '_'
+   */
+  String cleanFieldName(String name) {
+    String lowercase = name.toLowerCase();
+    return lowercase.replaceAll("[^a-z_0-9]", "_");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java b/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
new file mode 100644
index 0000000..5783981
--- /dev/null
+++ b/nutch-plugins/indexer-cloudsearch/src/main/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexwriter.cloudsearch;
+
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+import org.apache.commons.codec.binary.Hex;
+
+/**
+ * Helper routines for the CloudSearch index writer: document ID generation
+ * and string sanitization.
+ */
+public class CloudSearchUtils {
+
+  // NOTE(review): MessageDigest instances are stateful and not
+  // thread-safe; concurrent calls to getID() would corrupt each other's
+  // digests — confirm single-threaded use or guard access.
+  private static MessageDigest digester;
+
+  static {
+    try {
+      digester = MessageDigest.getInstance("SHA-512");
+    } catch (NoSuchAlgorithmException e) {
+      // SHA-512 is a mandatory algorithm for every JVM, so this should
+      // never happen in practice
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Returns a normalised doc ID based on the URL of a document: the
+   * SHA-512 digest of the UTF-8 bytes of the URL, hex-encoded.
+   **/
+  public static String getID(String url) {
+
+    // the document needs an ID
+    // @see
+    // http://docs.aws.amazon.com/cloudsearch/latest/developerguide/preparing-data.html#creating-document-batches
+    // A unique ID for the document. A document ID can contain any
+    // letter or number and the following characters: _ - = # ; : / ? @
+    // &. Document IDs must be at least 1 and no more than 128
+    // characters long.
+    byte[] dig = digester.digest(url.getBytes(StandardCharsets.UTF_8));
+    String ID = Hex.encodeHexString(dig);
+    // defensive check: a SHA-512 hex string is always exactly 128 chars
+    if (ID.length() > 128) {
+      throw new RuntimeException("ID larger than max 128 chars");
+    }
+    return ID;
+  }
+
+  /**
+   * Strips characters CloudSearch rejects: keeps tab, LF, CR and char
+   * values in the range 0x20-0xFFFD, dropping everything else.
+   */
+  public static String stripNonCharCodepoints(String input) {
+    StringBuilder retval = new StringBuilder();
+    char ch;
+
+    for (int i = 0; i < input.length(); i++) {
+      ch = input.charAt(i);
+
+      // Keep only characters that are legal for CloudSearch
+      if ((ch == 0x9 || ch == 0xa || ch == 0xd)
+          || (ch >= 0x20 && ch <= 0xFFFD)) {
+        retval.append(ch);
+      }
+    }
+
+    return retval.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-dummy/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-dummy/build.xml b/nutch-plugins/indexer-dummy/build.xml
new file mode 100644
index 0000000..d941278
--- /dev/null
+++ b/nutch-plugins/indexer-dummy/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-dummy" default="jar-core">
+
+  <import file="../build-plugin.xml" />
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-dummy/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-dummy/ivy.xml b/nutch-plugins/indexer-dummy/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/indexer-dummy/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-dummy/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-dummy/plugin.xml b/nutch-plugins/indexer-dummy/plugin.xml
new file mode 100644
index 0000000..963c66a
--- /dev/null
+++ b/nutch-plugins/indexer-dummy/plugin.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<plugin id="indexer-dummy" name="DummyIndexWriter" version="1.0.0"
+  provider-name="nutch.apache.org">
+
+  <runtime>
+    <library name="indexer-dummy.jar">
+      <export name="*" />
+    </library>
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints" />
+  </requires>
+
+  <extension id="org.apache.nutch.indexer.dummy"
+    name="Dummy Index Writer"
+    point="org.apache.nutch.indexer.IndexWriter">
+    <implementation id="DummyIndexWriter"
+      class="org.apache.nutch.indexwriter.dummy.DummyIndexWriter" />
+  </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-dummy/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-dummy/pom.xml b/nutch-plugins/indexer-dummy/pom.xml
new file mode 100644
index 0000000..33f52a9
--- /dev/null
+++ b/nutch-plugins/indexer-dummy/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>indexer-dummy</artifactId>
+    <packaging>jar</packaging>
+
+    <name>indexer-dummy</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-dummy/src/main/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-dummy/src/main/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java b/nutch-plugins/indexer-dummy/src/main/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
new file mode 100644
index 0000000..b27ba14
--- /dev/null
+++ b/nutch-plugins/indexer-dummy/src/main/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.dummy;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.FileWriter;
+import java.io.Writer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexerMapReduce;
+import org.apache.nutch.indexer.NutchDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * DummyIndexWriter. This pluggable indexer writes {@code <action>\t<url>\n}
+ * lines to a plain text file for debugging purposes. Possible actions are
+ * delete, update and add.
+ */
+public class DummyIndexWriter implements IndexWriter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(DummyIndexWriter.class);
+  private Configuration config;
+  // Output writer opened in setConf() on the file named by "dummy.path".
+  private Writer writer;
+  // Whether delete operations should be recorded (indexer delete setting).
+  private boolean delete = false;
+
+  /** Reads the indexer delete flag; the output file is opened in setConf(). */
+  public void open(JobConf job, String name) throws IOException {
+    delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
+  }
+
+  /** Records a delete action for the key, only when deletion is enabled. */
+  @Override
+  public void delete(String key) throws IOException {
+    if (delete) {
+      writer.write("delete\t" + key + "\n");
+    }
+  }
+
+  /** Records an update action for the document's "id" field. */
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    writer.write("update\t" + doc.getFieldValue("id") + "\n");
+  }
+
+  /** Records an add action for the document's "id" field. */
+  @Override
+  public void write(NutchDocument doc) throws IOException {
+    writer.write("add\t" + doc.getFieldValue("id") + "\n");
+  }
+
+  /** Flushes and closes the output file. */
+  public void close() throws IOException {
+    writer.flush();
+    writer.close();
+  }
+
+  /** Records a commit marker line. */
+  @Override
+  public void commit() throws IOException {
+    writer.write("commit\n");
+  }
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+  /**
+   * Stores the configuration and opens the output file named by the
+   * mandatory "dummy.path" property.
+   *
+   * @throws RuntimeException if dummy.path is not set
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    String path = conf.get("dummy.path");
+    if (path == null) {
+      String message = "Missing path. Should be set via -Ddummy.path";
+      message += "\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+
+    try {
+      writer = new BufferedWriter(new FileWriter(conf.get("dummy.path")));
+    } catch (IOException e) {
+      // NOTE(review): the exception is swallowed, leaving writer null and
+      // causing an NPE on the first write — consider logging or rethrowing
+    }
+  }
+
+  /** Describes the configuration properties understood by this writer. */
+  public String describe() {
+    StringBuffer sb = new StringBuffer("DummyIndexWriter\n");
+    sb.append("\t").append(
+        "dummy.path : Path of the file to write to (mandatory)\n");
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-dummy/src/main/java/org/apache/nutch/indexwriter/dummy/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-dummy/src/main/java/org/apache/nutch/indexwriter/dummy/package-info.java b/nutch-plugins/indexer-dummy/src/main/java/org/apache/nutch/indexwriter/dummy/package-info.java
new file mode 100644
index 0000000..8cc00c4
--- /dev/null
+++ b/nutch-plugins/indexer-dummy/src/main/java/org/apache/nutch/indexwriter/dummy/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index writer plugin for debugging, writes pairs of &lt;action, url&gt; to a
+ * text file, action is one of "add", "update", or "delete".
+ */
+package org.apache.nutch.indexwriter.dummy;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/build-ivy.xml b/nutch-plugins/indexer-elastic/build-ivy.xml
new file mode 100644
index 0000000..96f336c
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-elastic" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/build.xml b/nutch-plugins/indexer-elastic/build.xml
new file mode 100644
index 0000000..38955ff
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-elastic" default="jar-core">
+
+  <import file="../build-plugin.xml" />
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/howto_upgrade_es.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/howto_upgrade_es.txt b/nutch-plugins/indexer-elastic/howto_upgrade_es.txt
new file mode 100644
index 0000000..b577053
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/howto_upgrade_es.txt
@@ -0,0 +1,6 @@
+1. Upgrade elasticsearch dependency in src/plugin/indexer-elastic/ivy.xml
+
+2. Upgrade the Elasticsearch specific dependencies in src/plugin/indexer-elastic/plugin.xml
+   To get the list of dependencies and their versions execute:
+   $ ant -f ./build-ivy.xml
+   $ ls lib/

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/ivy.xml b/nutch-plugins/indexer-elastic/ivy.xml
new file mode 100644
index 0000000..f34075f
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/ivy.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+        <dependency org="org.elasticsearch" name="elasticsearch" rev="2.3.3"
+                    conf="*->default"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/plugin.xml b/nutch-plugins/indexer-elastic/plugin.xml
new file mode 100644
index 0000000..d99a665
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/plugin.xml
@@ -0,0 +1,71 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<plugin id="indexer-elastic" name="ElasticIndexWriter" version="1.0.0"
+  provider-name="nutch.apache.org">
+
+  <runtime>
+    <library name="indexer-elastic.jar">
+      <export name="*" />
+    </library>
+    <library name="elasticsearch-2.3.3.jar"/>
+    <library name="commons-cli-1.3.1.jar"/>
+    <library name="compress-lzf-1.0.2.jar"/>
+    <library name="guava-18.0.jar"/>
+    <library name="HdrHistogram-2.1.6.jar"/>
+    <library name="hppc-0.7.1.jar"/>
+    <library name="indexer-elastic.jar"/>
+    <library name="jackson-core-2.6.6.jar"/>
+    <library name="jackson-dataformat-cbor-2.6.6.jar"/>
+    <library name="jackson-dataformat-smile-2.6.6.jar"/>
+    <library name="jackson-dataformat-yaml-2.6.6.jar"/>
+    <library name="joda-convert-1.2.jar"/>
+    <library name="joda-time-2.8.2.jar"/>
+    <library name="jsr166e-1.1.0.jar"/>
+    <library name="lucene-analyzers-common-5.5.0.jar"/>
+    <library name="lucene-backward-codecs-5.5.0.jar"/>
+    <library name="lucene-core-5.5.0.jar"/>
+    <library name="lucene-grouping-5.5.0.jar"/>
+    <library name="lucene-highlighter-5.5.0.jar"/>
+    <library name="lucene-join-5.5.0.jar"/>
+    <library name="lucene-memory-5.5.0.jar"/>
+    <library name="lucene-misc-5.5.0.jar"/>
+    <library name="lucene-queries-5.5.0.jar"/>
+    <library name="lucene-queryparser-5.5.0.jar"/>
+    <library name="lucene-sandbox-5.5.0.jar"/>
+    <library name="lucene-spatial-5.5.0.jar"/>
+    <library name="lucene-spatial3d-5.5.0.jar"/>
+    <library name="lucene-suggest-5.5.0.jar"/>
+    <library name="netty-3.10.5.Final.jar"/>
+    <library name="securesm-1.0.jar"/>
+    <library name="snakeyaml-1.15.jar"/>
+    <library name="spatial4j-0.5.jar"/>
+    <library name="t-digest-3.0.jar"/>
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints" />
+  </requires>
+
+  <extension id="org.apache.nutch.indexer.elastic"
+    name="Elasticsearch Index Writer"
+    point="org.apache.nutch.indexer.IndexWriter">
+    <implementation id="ElasticIndexWriter"
+      class="org.apache.nutch.indexwriter.elastic.ElasticIndexWriter" />
+  </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/pom.xml b/nutch-plugins/indexer-elastic/pom.xml
new file mode 100644
index 0000000..165a94e
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>indexer-elastic</artifactId>
+    <packaging>jar</packaging>
+
+    <name>indexer-elastic</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.elasticsearch</groupId>
+            <artifactId>elasticsearch</artifactId>
+            <version>2.3.3</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java b/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
new file mode 100644
index 0000000..b0e70c8
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.elastic;
+
+public interface ElasticConstants {
+  public static final String ELASTIC_PREFIX = "elastic.";
+
+  public static final String HOST = ELASTIC_PREFIX + "host";
+  public static final String PORT = ELASTIC_PREFIX + "port";
+  public static final String CLUSTER = ELASTIC_PREFIX + "cluster";
+  public static final String INDEX = ELASTIC_PREFIX + "index";
+  public static final String MAX_BULK_DOCS = ELASTIC_PREFIX + "max.bulk.docs";
+  public static final String MAX_BULK_LENGTH = ELASTIC_PREFIX + "max.bulk.size";
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
new file mode 100644
index 0000000..9367e41
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexwriter.elastic;
+
+import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.elasticsearch.ElasticsearchException;
+import org.elasticsearch.action.ListenableActionFuture;
+import org.elasticsearch.action.bulk.BulkItemResponse;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.action.delete.DeleteRequestBuilder;
+import org.elasticsearch.action.index.IndexRequestBuilder;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.client.transport.TransportClient;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.Settings.Builder;
+import org.elasticsearch.common.transport.InetSocketTransportAddress;
+import org.elasticsearch.node.Node;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ */
+public class ElasticIndexWriter implements IndexWriter {
+  public static Logger LOG = LoggerFactory.getLogger(ElasticIndexWriter.class);
+
+  private static final int DEFAULT_MAX_BULK_DOCS = 250;
+  private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;
+
+  private Client client;
+  private Node node;
+  private String defaultIndex;
+
+  private Configuration config;
+
+  private BulkRequestBuilder bulk;
+  private ListenableActionFuture<BulkResponse> execute;
+  private int port = -1;
+  private String host = null;
+  private String clusterName = null;
+  private int maxBulkDocs;
+  private int maxBulkLength;
+  private long indexedDocs = 0;
+  private int bulkDocs = 0;
+  private int bulkLength = 0;
+  private boolean createNewBulk = false;
+
+  @Override
+  public void open(JobConf job, String name) throws IOException {
+    clusterName = job.get(ElasticConstants.CLUSTER);
+
+    host = job.get(ElasticConstants.HOST);
+    port = job.getInt(ElasticConstants.PORT, 9300);
+
+    Builder settingsBuilder = Settings.builder();
+
+    BufferedReader reader = new BufferedReader(
+        job.getConfResourceAsReader("elasticsearch.conf"));
+    String line;
+    String parts[];
+
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        line.trim();
+        parts = line.split("=");
+
+        if (parts.length == 2) {
+          settingsBuilder.put(parts[0].trim(), parts[1].trim());
+        }
+      }
+    }
+
+    if (StringUtils.isNotBlank(clusterName))
+      settingsBuilder.put("cluster.name", clusterName);
+
+    // Set the cluster name and build the settings
+    Settings settings = settingsBuilder.build();
+
+    // Prefer TransportClient
+    if (host != null && port > 1) {
+      TransportClient transportClient = TransportClient.builder()
+          .settings(settings).build()
+          .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port));
+      client = transportClient;
+    } else if (clusterName != null) {
+      node = nodeBuilder().settings(settings).client(true).node();
+      client = node.client();
+    }
+
+    bulk = client.prepareBulk();
+    defaultIndex = job.get(ElasticConstants.INDEX, "nutch");
+    maxBulkDocs = job.getInt(ElasticConstants.MAX_BULK_DOCS,
+        DEFAULT_MAX_BULK_DOCS);
+    maxBulkLength = job.getInt(ElasticConstants.MAX_BULK_LENGTH,
+        DEFAULT_MAX_BULK_LENGTH);
+  }
+
+  @Override
+  public void write(NutchDocument doc) throws IOException {
+    String id = (String) doc.getFieldValue("id");
+    String type = doc.getDocumentMeta().get("type");
+    if (type == null)
+      type = "doc";
+    IndexRequestBuilder request = client.prepareIndex(defaultIndex, type, id);
+
+    Map<String, Object> source = new HashMap<String, Object>();
+
+    // Loop through all fields of this doc
+    for (String fieldName : doc.getFieldNames()) {
+      if (doc.getField(fieldName).getValues().size() > 1) {
+        source.put(fieldName, doc.getFieldValue(fieldName));
+        // Loop through the values to keep track of the size of this
+        // document
+        for (Object value : doc.getField(fieldName).getValues()) {
+          bulkLength += value.toString().length();
+        }
+      } else {
+        if (doc.getFieldValue(fieldName) != null) {
+          source.put(fieldName, doc.getFieldValue(fieldName));
+          bulkLength += doc.getFieldValue(fieldName).toString().length();
+        }
+      }
+    }
+    request.setSource(source);
+
+    // Add this indexing request to a bulk request
+    bulk.add(request);
+    indexedDocs++;
+    bulkDocs++;
+
+    if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) {
+      LOG.info("Processing bulk request [docs = " + bulkDocs + ", length = "
+          + bulkLength + ", total docs = " + indexedDocs
+          + ", last doc in bulk = '" + id + "']");
+      // Flush the bulk of indexing requests
+      createNewBulk = true;
+      commit();
+    }
+  }
+
+  @Override
+  public void delete(String key) throws IOException {
+    try {
+      DeleteRequestBuilder builder = client.prepareDelete();
+      builder.setIndex(defaultIndex);
+      builder.setType("doc");
+      builder.setId(key);
+      builder.execute().actionGet();
+    } catch (ElasticsearchException e) {
+      throw makeIOException(e);
+    }
+  }
+
+  public static IOException makeIOException(ElasticsearchException e) {
+    final IOException ioe = new IOException();
+    ioe.initCause(e);
+    return ioe;
+  }
+
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    write(doc);
+  }
+
+  @Override
+  public void commit() throws IOException {
+    if (execute != null) {
+      // wait for previous to finish
+      long beforeWait = System.currentTimeMillis();
+      BulkResponse actionGet = execute.actionGet();
+      if (actionGet.hasFailures()) {
+        for (BulkItemResponse item : actionGet) {
+          if (item.isFailed()) {
+            throw new RuntimeException("First failure in bulk: "
+                + item.getFailureMessage());
+          }
+        }
+      }
+      long msWaited = System.currentTimeMillis() - beforeWait;
+      LOG.info("Previous took in ms " + actionGet.getTookInMillis()
+          + ", including wait " + msWaited);
+      execute = null;
+    }
+    if (bulk != null) {
+      if (bulkDocs > 0) {
+        // start a flush, note that this is an asynchronous call
+        execute = bulk.execute();
+      }
+      bulk = null;
+    }
+    if (createNewBulk) {
+      // Prepare a new bulk request
+      bulk = client.prepareBulk();
+      bulkDocs = 0;
+      bulkLength = 0;
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    // Flush pending requests
+    LOG.info("Processing remaining requests [docs = " + bulkDocs
+        + ", length = " + bulkLength + ", total docs = " + indexedDocs + "]");
+    createNewBulk = false;
+    commit();
+    // flush one more time to finalize the last bulk
+    LOG.info("Processing to finalize last execute");
+    createNewBulk = false;
+    commit();
+
+    // Close
+    client.close();
+    if (node != null) {
+      node.close();
+    }
+  }
+
+  @Override
+  public String describe() {
+    StringBuffer sb = new StringBuffer("ElasticIndexWriter\n");
+    sb.append("\t").append(ElasticConstants.CLUSTER)
+        .append(" : elastic prefix cluster\n");
+    sb.append("\t").append(ElasticConstants.HOST).append(" : hostname\n");
+    sb.append("\t").append(ElasticConstants.PORT).append(" : port\n");
+    sb.append("\t").append(ElasticConstants.INDEX)
+        .append(" : elastic index command \n");
+    sb.append("\t").append(ElasticConstants.MAX_BULK_DOCS)
+        .append(" : elastic bulk index doc counts. (default 250) \n");
+    sb.append("\t").append(ElasticConstants.MAX_BULK_LENGTH)
+        .append(" : elastic bulk index length. (default 2500500 ~2.5MB)\n");
+    return sb.toString();
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    String cluster = conf.get(ElasticConstants.CLUSTER);
+    String host = conf.get(ElasticConstants.HOST);
+
+    if (StringUtils.isBlank(cluster) && StringUtils.isBlank(host)) {
+      String message = "Missing elastic.cluster and elastic.host. At least one of them should be set in nutch-site.xml ";
+      message += "\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/package-info.java b/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/package-info.java
new file mode 100644
index 0000000..f708334
--- /dev/null
+++ b/nutch-plugins/indexer-elastic/src/main/java/org/apache/nutch/indexwriter/elastic/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index writer plugin for <a href="http://www.elasticsearch.org/">Elasticsearch</a>.
+ */
+package org.apache.nutch.indexwriter.elastic;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/build-ivy.xml b/nutch-plugins/indexer-solr/build-ivy.xml
new file mode 100644
index 0000000..9832cf0
--- /dev/null
+++ b/nutch-plugins/indexer-solr/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-solr" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/build.xml b/nutch-plugins/indexer-solr/build.xml
new file mode 100644
index 0000000..8d77cdf
--- /dev/null
+++ b/nutch-plugins/indexer-solr/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-solr" default="jar-core">
+
+  <import file="../build-plugin.xml" />
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/ivy.xml b/nutch-plugins/indexer-solr/ivy.xml
new file mode 100644
index 0000000..65e97e7
--- /dev/null
+++ b/nutch-plugins/indexer-solr/ivy.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.apache.solr" name="solr-solrj" rev="5.5.0"/>
+    <dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.1" conf="*->default"/>
+    <dependency org="org.apache.httpcomponents" name="httpmime" rev="4.4.1" conf="*->default"/>
+  </dependencies>
+  
+</ivy-module>


[21/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/plugin.xml b/nutch-plugins/indexer-solr/plugin.xml
new file mode 100644
index 0000000..0e86796
--- /dev/null
+++ b/nutch-plugins/indexer-solr/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<plugin id="indexer-solr" name="SolrIndexWriter" version="1.0.0"
+  provider-name="nutch.apache.org">
+
+  <runtime>
+    <library name="indexer-solr.jar">
+      <export name="*" />
+    </library>
+      <library name="commons-io-2.4.jar"/>
+      <library name="httpclient-4.4.1.jar"/>
+      <library name="httpcore-4.4.1.jar"/>
+      <library name="httpmime-4.4.1.jar"/>
+      <library name="noggit-0.6.jar"/>
+      <library name="slf4j-api-1.7.7.jar"/>
+      <library name="solr-solrj-5.5.0.jar"/>
+      <library name="stax2-api-3.1.4.jar"/>
+      <library name="woodstox-core-asl-4.4.1.jar"/>
+      <library name="zookeeper-3.4.6.jar"/> 
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints" />
+  </requires>
+
+  <extension id="org.apache.nutch.indexer.solr"
+    name="Solr Index Writer"
+    point="org.apache.nutch.indexer.IndexWriter">
+    <implementation id="SolrIndexWriter"
+      class="org.apache.nutch.indexwriter.solr.SolrIndexWriter" />
+  </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/pom.xml b/nutch-plugins/indexer-solr/pom.xml
new file mode 100644
index 0000000..94ff824
--- /dev/null
+++ b/nutch-plugins/indexer-solr/pom.xml
@@ -0,0 +1,55 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>indexer-solr</artifactId>
+    <packaging>jar</packaging>
+
+    <name>indexer-solr</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.solr</groupId>
+            <artifactId>solr-solrj</artifactId>
+            <version>5.5.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpcore</artifactId>
+            <version>4.4.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpmime</artifactId>
+            <version>4.4.1</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrConstants.java b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
new file mode 100644
index 0000000..44a382e
--- /dev/null
+++ b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+/**
+ * Configuration property names and index field names shared by the
+ * classes of the Solr index-writer plugin.
+ */
+public interface SolrConstants {
+  /** Common prefix of all Solr-related configuration properties. */
+  public static final String SOLR_PREFIX = "solr.";
+
+  /** URL(s) of the Solr server(s) to send documents to. */
+  public static final String SERVER_URL = SOLR_PREFIX + "server.url";
+
+  /** Number of buffered documents that triggers a batch push to Solr. */
+  public static final String COMMIT_SIZE = SOLR_PREFIX + "commit.size";
+
+  /** Name of the field mapping file (default: solrindex-mapping.xml). */
+  public static final String MAPPING_FILE = SOLR_PREFIX + "mapping.file";
+
+  /** Whether to use authentication when talking to Solr (default false). */
+  public static final String USE_AUTH = SOLR_PREFIX + "auth";
+
+  /** Username for Solr authentication. */
+  public static final String USERNAME = SOLR_PREFIX + "auth.username";
+
+  /** Password for Solr authentication. */
+  public static final String PASSWORD = SOLR_PREFIX + "auth.password";
+
+  /** SolrCloud collection to index into (used with ZOOKEEPER_HOSTS). */
+  public static final String COLLECTION = SOLR_PREFIX + "collection";
+
+  /** ZooKeeper host list(s) for SolrCloud; '|' separates hosts within an ensemble. */
+  public static final String ZOOKEEPER_HOSTS = SOLR_PREFIX + "zookeeper.hosts";
+
+  /** Index field holding the document's unique id. */
+  public static final String ID_FIELD = "id";
+
+  /** Index field holding the document URL. */
+  public static final String URL_FIELD = "url";
+
+  /** Index field holding the document boost. */
+  public static final String BOOST_FIELD = "boost";
+
+  /** Index field holding the fetch timestamp. */
+  public static final String TIMESTAMP_FIELD = "tstamp";
+
+  /** Index field holding the content digest. */
+  public static final String DIGEST_FIELD = "digest";
+
+
+
+  @Deprecated
+  public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index";
+
+  @Deprecated
+  public static final String PARAMS = SOLR_PREFIX + "params";
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
new file mode 100644
index 0000000..0d9e2e0
--- /dev/null
+++ b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexerMapReduce;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.common.util.DateUtil;
+import org.apache.solr.common.util.NamedList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.NutchConfiguration;
+
+// WORK AROUND FOR NOT REMOVING URL ENCODED URLS!!!
+import java.net.URLDecoder;
+
+/**
+ * {@link IndexWriter} implementation that sends documents to one or more
+ * Solr instances (plain HTTP or SolrCloud). Additions and deletions are
+ * buffered and flushed in batches of {@code solr.commit.size} documents.
+ */
+public class SolrIndexWriter implements IndexWriter {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SolrIndexWriter.class);
+
+  private List<SolrClient> solrClients;
+  private SolrMappingReader solrMapping;
+  private ModifiableSolrParams params;
+
+  private Configuration config;
+
+  // documents buffered until the next push()
+  private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>();
+
+  // currently unused buffer; kept for compatibility
+  private final List<SolrInputDocument> updateDocs = new ArrayList<SolrInputDocument>();
+
+  // document ids buffered for deletion until the next push()
+  private final List<String> deleteIds = new ArrayList<String>();
+
+  private int batchSize;          // flush threshold (solr.commit.size)
+  private int numDeletes = 0;
+  private int totalAdds = 0;
+  private int totalDeletes = 0;
+  private int totalUpdates = 0;
+  private boolean delete = false; // whether deletions are enabled
+
+  /**
+   * Opens connections to the configured Solr endpoints.
+   *
+   * @param job job configuration holding the Solr connection properties
+   * @param name writer name (unused)
+   */
+  public void open(JobConf job, String name) throws IOException {
+    solrClients = SolrUtils.getSolrClients(job);
+    init(solrClients, job);
+  }
+
+  // package protected for tests
+  void init(List<SolrClient> solrClients, JobConf job) throws IOException {
+    batchSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
+    solrMapping = SolrMappingReader.getInstance(job);
+    delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
+    // parse optional "key=value&key=value" request parameters
+    params = new ModifiableSolrParams();
+    String paramString = job.get(IndexerMapReduce.INDEXER_PARAMS);
+    if (paramString != null) {
+      String[] values = paramString.split("&");
+      for (String v : values) {
+        String[] kv = v.split("=");
+        if (kv.length < 2) {
+          continue; // skip malformed entries
+        }
+        params.add(kv[0], kv[1]);
+      }
+    }
+  }
+
+  /**
+   * Queues a document id for deletion and flushes when the batch is full.
+   * The key is URL-decoded first because ids may arrive percent-encoded.
+   *
+   * @param key document id (URL), possibly percent-encoded
+   */
+  public void delete(String key) throws IOException {
+    try {
+      key = URLDecoder.decode(key, "UTF8");
+    } catch (UnsupportedEncodingException e) {
+      LOG.error("Error decoding: " + key);
+      throw new IOException("UnsupportedEncodingException for " + key);
+    } catch (IllegalArgumentException e) {
+      LOG.warn("Could not decode: " + key + ", it probably wasn't encoded in the first place..");
+    }
+
+    // Escape the Solr hash separator. Bug fix: the original
+    // replaceAll("!", "\\!") was a no-op, because "\!" unescapes to a
+    // plain "!" in a regex replacement string; a literal replace
+    // produces the intended "\!".
+    key = key.replace("!", "\\!");
+
+    if (delete) {
+      deleteIds.add(key);
+      totalDeletes++;
+    }
+
+    if (deleteIds.size() >= batchSize) {
+      push();
+    }
+
+  }
+
+  /**
+   * Immediately deletes all documents matching the given Solr query on
+   * every configured client (not batched).
+   *
+   * @param query Solr query selecting the documents to delete
+   */
+  public void deleteByQuery(String query) throws IOException {
+    try {
+      LOG.info("SolrWriter: deleting " + query);
+      for (SolrClient solrClient : solrClients) {
+        solrClient.deleteByQuery(query);
+      }
+    } catch (final SolrServerException e) {
+      LOG.error("Error deleting: " + deleteIds);
+      throw makeIOException(e);
+    }
+  }
+
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    // updates are implemented as re-adds
+    write(doc);
+  }
+
+  /**
+   * Converts a {@link NutchDocument} into a {@link SolrInputDocument}
+   * (applying the field mapping) and queues it for indexing; flushes
+   * when the batch is full.
+   */
+  public void write(NutchDocument doc) throws IOException {
+    final SolrInputDocument inputDoc = new SolrInputDocument();
+
+    for (final Entry<String, NutchField> e : doc) {
+      for (final Object val : e.getValue().getValues()) {
+        // normalise the string representation for a Date
+        Object val2 = val;
+
+        if (val instanceof Date) {
+          val2 = DateUtil.getThreadLocalDateFormat().format(val);
+        }
+
+        // strip noncharacters/control chars from free-text fields
+        if (e.getKey().equals("content") || e.getKey().equals("title")) {
+          val2 = SolrUtils.stripNonCharCodepoints((String) val);
+        }
+
+        inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e.getValue()
+            .getWeight());
+        String sCopy = solrMapping.mapCopyKey(e.getKey());
+        // compare by value (the original used a fragile reference
+        // comparison) to decide whether a copy-field mapping exists
+        if (!sCopy.equals(e.getKey())) {
+          inputDoc.addField(sCopy, val);
+        }
+      }
+    }
+
+    inputDoc.setDocumentBoost(doc.getWeight());
+    inputDocs.add(inputDoc);
+    totalAdds++;
+
+    if (inputDocs.size() + numDeletes >= batchSize) {
+      push();
+    }
+  }
+
+  /** Flushes pending work, commits and closes all Solr clients. */
+  public void close() throws IOException {
+    commit();
+
+    for (SolrClient solrClient : solrClients) {
+      solrClient.close();
+    }
+  }
+
+  @Override
+  public void commit() throws IOException {
+    push();
+    try {
+      for (SolrClient solrClient : solrClients) {
+        solrClient.commit();
+      }
+    } catch (final SolrServerException e) {
+      // commit failures are logged but deliberately not rethrown
+      LOG.error("Failed to commit solr connection: " + e.getMessage()); // FIXME
+    }
+  }
+
+  /** Sends the buffered additions and deletions to every Solr client. */
+  public void push() throws IOException {
+    if (inputDocs.size() > 0) {
+      try {
+        LOG.info("Indexing " + Integer.toString(inputDocs.size())
+            + "/" + Integer.toString(totalAdds) + " documents");
+        LOG.info("Deleting " + Integer.toString(numDeletes) + " documents");
+        numDeletes = 0;
+        UpdateRequest req = new UpdateRequest();
+        req.add(inputDocs);
+        // NOTE(review): OPTIMIZE is requested on every batch, which forces
+        // a full index optimize per push and is very expensive on large
+        // indexes — consider optimizing only on close(). Behavior kept.
+        req.setAction(AbstractUpdateRequest.ACTION.OPTIMIZE, false, false);
+        req.setParams(params);
+        for (SolrClient solrClient : solrClients) {
+          solrClient.request(req); // response was unused; dropped raw NamedList
+        }
+      } catch (final SolrServerException e) {
+        throw makeIOException(e);
+      }
+      inputDocs.clear();
+    }
+
+    if (deleteIds.size() > 0) {
+      try {
+        LOG.info("SolrIndexer: deleting " + Integer.toString(deleteIds.size())
+            + "/" + Integer.toString(totalDeletes) + " documents");
+        for (SolrClient solrClient : solrClients) {
+          solrClient.deleteById(deleteIds);
+        }
+      } catch (final SolrServerException e) {
+        LOG.error("Error deleting: " + deleteIds);
+        throw makeIOException(e);
+      }
+      deleteIds.clear();
+    }
+  }
+
+  /** Wraps a {@link SolrServerException} in an IOException, keeping the cause. */
+  public static IOException makeIOException(SolrServerException e) {
+    final IOException ioe = new IOException();
+    ioe.initCause(e);
+    return ioe;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+  /**
+   * Stores the configuration and validates that at least one of the Solr
+   * URL or the ZooKeeper host list is set.
+   *
+   * @throws RuntimeException if neither property is configured
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    String serverURL = conf.get(SolrConstants.SERVER_URL);
+    String zkHosts = conf.get(SolrConstants.ZOOKEEPER_HOSTS);
+    if (serverURL == null && zkHosts == null) {
+      // typo fix: "Either on" -> "Either one"
+      String message = "Missing SOLR URL and Zookeeper URL. Either one should be set via -D "
+          + SolrConstants.SERVER_URL + " or -D " + SolrConstants.ZOOKEEPER_HOSTS;
+      message += "\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+  }
+
+  /** Returns a human-readable description of the supported properties. */
+  public String describe() {
+    StringBuilder sb = new StringBuilder("SOLRIndexWriter\n");
+    sb.append("\t").append(SolrConstants.SERVER_URL)
+        .append(" : URL of the SOLR instance\n");
+    sb.append("\t").append(SolrConstants.ZOOKEEPER_HOSTS)
+        .append(" : URL of the Zookeeper quorum\n");
+    sb.append("\t").append(SolrConstants.COMMIT_SIZE)
+        .append(" : buffer size when sending to SOLR (default 1000)\n");
+    sb.append("\t")
+        .append(SolrConstants.MAPPING_FILE)
+        .append(
+            " : name of the mapping file for fields (default solrindex-mapping.xml)\n");
+    sb.append("\t").append(SolrConstants.USE_AUTH)
+        .append(" : use authentication (default false)\n");
+    sb.append("\t").append(SolrConstants.USERNAME)
+        .append(" : username for authentication\n");
+    sb.append("\t").append(SolrConstants.PASSWORD)
+        .append(" : password for authentication\n");
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
new file mode 100644
index 0000000..19ffa6f
--- /dev/null
+++ b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.ObjectCache;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+public class SolrMappingReader {
+  public static Logger LOG = LoggerFactory.getLogger(SolrMappingReader.class);
+
+  private Configuration conf;
+
+  private Map<String, String> keyMap = new HashMap<String, String>();
+  private Map<String, String> copyMap = new HashMap<String, String>();
+  private String uniqueKey = "id";
+
+  public static synchronized SolrMappingReader getInstance(Configuration conf) {
+    ObjectCache cache = ObjectCache.get(conf);
+    SolrMappingReader instance = (SolrMappingReader) cache
+        .getObject(SolrMappingReader.class.getName());
+    if (instance == null) {
+      instance = new SolrMappingReader(conf);
+      cache.setObject(SolrMappingReader.class.getName(), instance);
+    }
+    return instance;
+  }
+
+  protected SolrMappingReader(Configuration conf) {
+    this.conf = conf;
+    parseMapping();
+  }
+
+  private void parseMapping() {
+    InputStream ssInputStream = null;
+    ssInputStream = conf.getConfResourceAsInputStream(conf.get(
+        SolrConstants.MAPPING_FILE, "solrindex-mapping.xml"));
+
+    InputSource inputSource = new InputSource(ssInputStream);
+    try {
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(inputSource);
+      Element rootElement = document.getDocumentElement();
+      NodeList fieldList = rootElement.getElementsByTagName("field");
+      if (fieldList.getLength() > 0) {
+        for (int i = 0; i < fieldList.getLength(); i++) {
+          Element element = (Element) fieldList.item(i);
+          LOG.info("source: " + element.getAttribute("source") + " dest: "
+              + element.getAttribute("dest"));
+          keyMap.put(element.getAttribute("source"),
+              element.getAttribute("dest"));
+        }
+      }
+      NodeList copyFieldList = rootElement.getElementsByTagName("copyField");
+      if (copyFieldList.getLength() > 0) {
+        for (int i = 0; i < copyFieldList.getLength(); i++) {
+          Element element = (Element) copyFieldList.item(i);
+          LOG.info("source: " + element.getAttribute("source") + " dest: "
+              + element.getAttribute("dest"));
+          copyMap.put(element.getAttribute("source"),
+              element.getAttribute("dest"));
+        }
+      }
+      NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey");
+      if (uniqueKeyItem.getLength() > 1) {
+        LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'");
+        uniqueKey = "id";
+      } else if (uniqueKeyItem.getLength() == 0) {
+        LOG.warn("No unique key definition found in solr index mapping using, default 'id'");
+      } else {
+        uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue();
+      }
+    } catch (MalformedURLException e) {
+      LOG.warn(e.toString());
+    } catch (SAXException e) {
+      LOG.warn(e.toString());
+    } catch (IOException e) {
+      LOG.warn(e.toString());
+    } catch (ParserConfigurationException e) {
+      LOG.warn(e.toString());
+    }
+  }
+
+  public Map<String, String> getKeyMap() {
+    return keyMap;
+  }
+
+  public Map<String, String> getCopyMap() {
+    return copyMap;
+  }
+
+  public String getUniqueKey() {
+    return uniqueKey;
+  }
+
+  public String hasCopy(String key) {
+    if (copyMap.containsKey(key)) {
+      key = (String) copyMap.get(key);
+    }
+    return key;
+  }
+
+  public String mapKey(String key) throws IOException {
+    if (keyMap.containsKey(key)) {
+      key = (String) keyMap.get(key);
+    }
+    return key;
+  }
+
+  public String mapCopyKey(String key) throws IOException {
+    if (copyMap.containsKey(key)) {
+      key = (String) copyMap.get(key);
+    }
+    return key;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrUtils.java b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
new file mode 100644
index 0000000..eec0080
--- /dev/null
+++ b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.solr;
+
+
+import java.util.ArrayList;
+import java.util.List;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+
+import java.net.MalformedURLException;
+
+/** Helper methods for creating Solr clients and sanitizing field values. */
+public class SolrUtils {
+
+  public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
+
+  /**
+   * Builds one {@link SolrClient} per configured endpoint. If
+   * {@code solr.zookeeper.hosts} is set, SolrCloud clients are created
+   * (one per ZooKeeper ensemble); otherwise plain HTTP clients are
+   * created from {@code solr.server.url}.
+   *
+   * @param job job configuration holding the Solr connection properties
+   * @return list of clients, one per configured endpoint
+   */
+  public static ArrayList<SolrClient> getSolrClients(JobConf job) throws MalformedURLException {
+    String[] urls = job.getStrings(SolrConstants.SERVER_URL);
+    String[] zkHostString = job.getStrings(SolrConstants.ZOOKEEPER_HOSTS);
+    ArrayList<SolrClient> solrClients = new ArrayList<SolrClient>();
+
+    if (zkHostString != null && zkHostString.length > 0) {
+      // SolrCloud mode: one CloudSolrClient per ZooKeeper ensemble
+      for (String zkHost : zkHostString) {
+        CloudSolrClient sc = getCloudSolrClient(zkHost);
+        sc.setDefaultCollection(job.get(SolrConstants.COLLECTION));
+        solrClients.add(sc);
+      }
+    } else {
+      // plain HTTP mode; assumes solr.server.url is set (the index
+      // writer's setConf() rejects configs where both properties are null)
+      for (String url : urls) {
+        solrClients.add(new HttpSolrClient(url));
+      }
+    }
+
+    return solrClients;
+  }
+
+  /**
+   * Creates a connected SolrCloud client; '|' separates hosts within an
+   * ensemble and is translated to the comma SolrJ expects.
+   */
+  public static CloudSolrClient getCloudSolrClient(String url) throws MalformedURLException {
+    CloudSolrClient sc = new CloudSolrClient(url.replace('|', ','));
+    sc.setParallelUpdates(true);
+    sc.connect();
+    return sc;
+  }
+
+  /** Creates a plain HTTP Solr client for the given URL. */
+  public static SolrClient getHttpSolrClient(String url) throws MalformedURLException {
+    return new HttpSolrClient(url);
+  }
+
+  /**
+   * Removes Unicode noncharacters and non-printable control characters
+   * (except tab, LF and CR) from the input.
+   *
+   * @param input string to sanitize
+   * @return the input with disallowed code points removed
+   */
+  public static String stripNonCharCodepoints(String input) {
+    StringBuilder retval = new StringBuilder();
+    char ch;
+
+    for (int i = 0; i < input.length(); i++) {
+      ch = input.charAt(i);
+
+      // Strip all non-characters
+      // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+      // and non-printable control characters except tabulator, new line and
+      // carriage return.
+      // Bug fix: the noncharacter block U+FDD0..U+FDEF is inclusive at both
+      // ends; the original bounds (<= 0xfdd0 || >= 0xfdef) kept U+FDD0 and
+      // U+FDEF, which are themselves noncharacters.
+      if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
+          ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
+          (ch < 0xfdd0 || ch > 0xfdef) && // U+FDD0 - U+FDEF noncharacters
+          (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
+
+        retval.append(ch);
+      }
+    }
+
+    return retval.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/package-info.java b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/package-info.java
new file mode 100644
index 0000000..af5f50c
--- /dev/null
+++ b/nutch-plugins/indexer-solr/src/main/java/org/apache/nutch/indexwriter/solr/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index writer plugin for <a href="http://lucene.apache.org/solr/">Apache Solr</a>.
+ */
+package org.apache.nutch.indexwriter.solr;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/build.xml b/nutch-plugins/language-identifier/build.xml
new file mode 100644
index 0000000..668075e
--- /dev/null
+++ b/nutch-plugins/language-identifier/build.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="language-identifier" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <target name="init-plugin">
+    <echo>Copying language profiles</echo>
+    <copy todir="${build.classes}">
+      <fileset dir="${src.dir}" includes="**/*.ngp, **/*.properties"/>
+    </copy>
+    <echo>Copying test files</echo>
+    <copy todir="${build.test}">
+      <fileset dir="${src.test}" includes="**/*.test, **/*.txt"/>
+    </copy>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/ivy.xml b/nutch-plugins/language-identifier/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/language-identifier/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/plugin.xml b/nutch-plugins/language-identifier/plugin.xml
new file mode 100644
index 0000000..dcf1209
--- /dev/null
+++ b/nutch-plugins/language-identifier/plugin.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="language-identifier"
+   name="Language Identification Parser/Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+    <runtime>
+      <library name="language-identifier.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.analysis.lang.LanguageParser"
+              name="Nutch language Parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="LanguageParser"
+                      class="org.apache.nutch.analysis.lang.HTMLLanguageParser"/>
+   </extension>
+
+   <extension id="org.apache.nutch.analysis.lang"
+              name="Nutch language identifier filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="LanguageIdentifier"
+                      class="org.apache.nutch.analysis.lang.LanguageIndexingFilter"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/pom.xml b/nutch-plugins/language-identifier/pom.xml
new file mode 100644
index 0000000..7937cde
--- /dev/null
+++ b/nutch-plugins/language-identifier/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>language-identifier</artifactId>
+    <packaging>jar</packaging>
+
+    <name>language-identifier</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
new file mode 100644
index 0000000..cb8f8c1
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
@@ -0,0 +1,320 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.analysis.lang;
+
+// JDK imports
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NodeWalker;
+import org.apache.tika.language.LanguageIdentifier;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+
+public class HTMLLanguageParser implements HtmlParseFilter {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HTMLLanguageParser.class);
+
+  // Positions of "detect" / "identify" within the lang.extraction.policy
+  // property value; -1 means the corresponding strategy is not configured.
+  private int detect = -1, identify = -1;
+
+  // Maximum number of characters handed to the statistical identifier
+  // (-1 = unlimited); set from lang.analyze.max.length.
+  private int contentMaxlength = -1;
+
+  // When true, only report a language if the identifier is reasonably certain.
+  private boolean onlyCertain = false;
+
+  /* A static Map of ISO-639 language codes */
+  // Maps each two-letter key and every alias listed for it in
+  // langmappings.properties (lower-cased) back to the two-letter code.
+  private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>();
+  static {
+    try {
+      Properties p = new Properties();
+      p.load(HTMLLanguageParser.class
+          .getResourceAsStream("langmappings.properties"));
+      Enumeration<?> keys = p.keys();
+      while (keys.hasMoreElements()) {
+        String key = (String) keys.nextElement();
+        String[] values = p.getProperty(key).split(",", -1);
+        LANGUAGES_MAP.put(key, key);
+        for (int i = 0; i < values.length; i++) {
+          LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
+        }
+      }
+    } catch (Exception e) {
+      // Best-effort: a missing/broken mapping file only disables alias lookup.
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.toString());
+      }
+    }
+  }
+
+  private Configuration conf;
+
+  /**
+   * Scan the HTML document looking at possible indications of content language<br>
+   * <li>1. html lang attribute
+   * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta
+   * dc.language
+   * (http://dublincore.org/documents/2000/07/16/usageguide/qualified
+   * -html.shtml#language) <li>3. meta http-equiv (content-language)
+   * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br>
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    String lang = null;
+
+    Parse parse = parseResult.get(content.getUrl());
+
+    // Run the configured strategies in the order given by
+    // lang.extraction.policy, falling back to the second one when the
+    // first yields nothing.
+    if (detect >= 0 && identify < 0) {
+      lang = detectLanguage(parse, doc);
+    } else if (detect < 0 && identify >= 0) {
+      lang = identifyLanguage(parse);
+    } else if (detect < identify) {
+      lang = detectLanguage(parse, doc);
+      if (lang == null) {
+        lang = identifyLanguage(parse);
+      }
+    } else if (identify < detect) {
+      lang = identifyLanguage(parse);
+      if (lang == null) {
+        lang = detectLanguage(parse, doc);
+      }
+    } else {
+      // Neither "detect" nor "identify" was configured (both indexes are -1).
+      LOG.warn("No configuration for language extraction policy is provided");
+      return parseResult;
+    }
+
+    if (lang != null) {
+      parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
+      return parseResult;
+    }
+
+    return parseResult;
+  }
+
+  /** Try to find the document's language from page headers and metadata */
+  private String detectLanguage(Parse page, DocumentFragment doc) {
+    String lang = getLanguageFromMetadata(page.getData().getParseMeta());
+    if (lang == null) {
+      // Walk the DOM looking for lang attributes and language meta tags.
+      LanguageParser parser = new LanguageParser(doc);
+      lang = parser.getLanguage();
+    }
+
+    if (lang != null) {
+      return lang;
+    }
+
+    // Last resort: the Content-Language HTTP response header.
+    lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
+
+    return lang;
+  }
+
+  /** Use statistical language identification to extract page language */
+  private String identifyLanguage(Parse parse) {
+    StringBuilder text = new StringBuilder();
+    if (parse == null)
+      return null;
+
+    String title = parse.getData().getTitle();
+    if (title != null) {
+      text.append(title.toString());
+    }
+
+    String content = parse.getText();
+    if (content != null) {
+      text.append(" ").append(content.toString());
+    }
+
+    // trim content?
+    String titleandcontent = text.toString();
+
+    // Honour lang.analyze.max.length to bound the identification cost.
+    if (this.contentMaxlength != -1
+        && titleandcontent.length() > this.contentMaxlength)
+      titleandcontent = titleandcontent.substring(0, contentMaxlength);
+
+    LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent);
+
+    if (onlyCertain) {
+      if (identifier.isReasonablyCertain())
+        return identifier.getLanguage();
+      else
+        return null;
+    }
+    return identifier.getLanguage();
+  }
+
+  // Check in the metadata whether the language has already been stored there
+  // by Tika
+  private static String getLanguageFromMetadata(Metadata meta) {
+    if (meta == null)
+      return null;
+    // dublin core
+    String lang = meta.get("dc.language");
+    if (lang != null)
+      return lang;
+    // meta content-language
+    lang = meta.get("content-language");
+    if (lang != null)
+      return lang;
+    // lang attribute
+    return meta.get("lang");
+  }
+
+  /** DOM walker collecting the three HTML language hints for one document. */
+  static class LanguageParser {
+
+    private String dublinCore = null;
+    private String htmlAttribute = null;
+    private String httpEquiv = null;
+    private String language = null;
+
+    LanguageParser(Node node) {
+      parse(node);
+      // Precedence: html lang attribute, then dc.language, then http-equiv.
+      if (htmlAttribute != null) {
+        language = htmlAttribute;
+      } else if (dublinCore != null) {
+        language = dublinCore;
+      } else {
+        language = httpEquiv;
+      }
+    }
+
+    /** Returns the chosen language hint, or null if none was found. */
+    String getLanguage() {
+      return language;
+    }
+
+    void parse(Node node) {
+
+      NodeWalker walker = new NodeWalker(node);
+      while (walker.hasNext()) {
+
+        Node currentNode = walker.nextNode();
+        String nodeName = currentNode.getNodeName();
+        short nodeType = currentNode.getNodeType();
+
+        if (nodeType == Node.ELEMENT_NODE) {
+
+          // Check for the lang HTML attribute
+          if (htmlAttribute == null) {
+            htmlAttribute = parseLanguage(((Element) currentNode)
+                .getAttribute("lang"));
+          }
+
+          // Check for Meta
+          if ("meta".equalsIgnoreCase(nodeName)) {
+            NamedNodeMap attrs = currentNode.getAttributes();
+
+            // Check for the dc.language Meta
+            if (dublinCore == null) {
+              for (int i = 0; i < attrs.getLength(); i++) {
+                Node attrnode = attrs.item(i);
+                if ("name".equalsIgnoreCase(attrnode.getNodeName())) {
+                  if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) {
+                    Node valueattr = attrs.getNamedItem("content");
+                    if (valueattr != null) {
+                      dublinCore = parseLanguage(valueattr.getNodeValue());
+                    }
+                  }
+                }
+              }
+            }
+
+            // Check for the http-equiv content-language
+            if (httpEquiv == null) {
+              for (int i = 0; i < attrs.getLength(); i++) {
+                Node attrnode = attrs.item(i);
+                if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) {
+                  if ("content-language".equals(attrnode.getNodeValue()
+                      .toLowerCase())) {
+                    Node valueattr = attrs.getNamedItem("content");
+                    if (valueattr != null) {
+                      httpEquiv = parseLanguage(valueattr.getNodeValue());
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        // Stop walking as soon as all three hints have been found.
+        if ((dublinCore != null) && (htmlAttribute != null)
+            && (httpEquiv != null)) {
+          return;
+        }
+      }
+    }
+
+    /**
+     * Parse a language string and return an ISO 639 primary code, or
+     * <code>null</code> if something wrong occurs, or if no language is found.
+     */
+    final static String parseLanguage(String lang) {
+
+      if (lang == null) {
+        return null;
+      }
+
+      String code = null;
+      String language = null;
+
+      // First, split multi-valued values
+      String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);
+
+      int i = 0;
+      while ((language == null) && (i < langs.length)) {
+        // Then, get the primary code
+        code = langs[i].split("-")[0];
+        code = code.split("_")[0];
+        // Find the ISO 639 code
+        language = (String) LANGUAGES_MAP.get(code.toLowerCase());
+        i++;
+      }
+
+      return language;
+    }
+
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    contentMaxlength = conf.getInt("lang.analyze.max.length", -1);
+    onlyCertain = conf.getBoolean("lang.identification.only.certain", false);
+    // NOTE(review): getStrings returns null when lang.extraction.policy is
+    // unset (no default supplied), which would NPE in the loop below;
+    // presumably the property is always defined in nutch-default.xml - verify.
+    String[] policy = conf.getStrings("lang.extraction.policy");
+    for (int i = 0; i < policy.length; i++) {
+      if (policy[i].equals("detect")) {
+        detect = i;
+      } else if (policy[i].equals("identify")) {
+        identify = i;
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java b/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
new file mode 100644
index 0000000..fbfe8f9
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.analysis.lang;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds a
+ * <code>lang</code> (language) field to the document.
+ * 
+ * It tries to find the language of the document by:
+ * <ul>
+ * <li>First, checking if {@link HTMLLanguageParser} added some language
+ * information</li>
+ * <li>Then, checking if a <code>Content-Language</code> HTTP header can be
+ * found</li>
+ * <li>Finally by analyzing the document content</li>
+ * </ul>
+ * 
+ * @author Sami Siren
+ * @author Jerome Charron
+ */
+public class LanguageIndexingFilter implements IndexingFilter {
+
+  private Configuration conf;
+
+  /**
+   * Constructs a new Language Indexing Filter.
+   */
+  public LanguageIndexingFilter() {
+
+  }
+
+  // Inherited JavaDoc
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // check if LANGUAGE found, possibly put there by HTMLLanguageParser
+    String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
+
+    // check if the HTTP header tells us the language
+    if (lang == null) {
+      lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
+    }
+
+    // fall back to "unknown" so the field is always present in the index
+    if (lang == null || lang.length() == 0) {
+      lang = "unknown";
+    }
+
+    doc.add("lang", lang);
+
+    return doc;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/langmappings.properties
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/langmappings.properties b/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/langmappings.properties
new file mode 100644
index 0000000..e02c8f9
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/langmappings.properties
@@ -0,0 +1,188 @@
+# Defines some mapping between common erroneous languages codes and
+# the ISO 639 two-letters language codes.
+aa=aar,Afar
+ab=abk,Abkhazian
+ae=ave
+af=afr,Afrikaans
+ak=aka
+am=amh,Amharic
+an=arg
+ar=ara,Arabic
+as=asm,Assamese
+av=ava
+ay=aym,Aymara
+az=aze,Azerbaijani
+ba=bak,Bashkir
+be=bel,Byelorussian
+bg=bul,Bulgarian
+bh=bih,Bihari
+bi=bis,Bislama
+bm=bam
+bn=ben,Bengali
+bo=bod,tib,Tibetan
+br=bre,Breton
+bs=bos
+ca=cat,Catalan
+ce=che
+ch=cha
+co=cos,Corsican
+cr=cre
+cs=ces,cze,Czech
+cu=chu
+cv=chv
+cy=cym,wel,Welsh
+da=dan,Danish
+de=deu,ger,German
+dv=div
+dz=dzo,Dzongkha
+ee=ewe
+el=ell,gre,Greek
+en=eng,English
+eo=epo,Esperanto
+es=esl,spa,Spanish
+et=est,Estonian
+eu=baq,eus,Basque
+fa=fas,per,Persian
+ff=ful
+fi=fin,Finnish
+fj=fij,Fijian
+fo=fao,Faroese
+fr=fra,fre,French
+fy=fry,Frisian
+ga=gai,iri,Irish
+gd=gla
+gl=glg,Gallegan
+gn=grn,Guarani
+gu=guj,Gujarati
+gv=glv
+ha=hau,Hausa
+he=heb,Hebrew
+hi=hin,Hindi
+ho=hmo
+hr=scr,hrv,Croatian
+ht=hat
+hu=hun,Hungarian
+hy=arm,hye,Armenian
+hz=her
+ia=ina,Interlingua
+id=ind,Indonesian
+ie=ile
+ig=ibo
+ii=iii
+ik=ipk,Inupiak
+io=ido
+is=ice,isl,Icelandic
+it=ita,Italian
+iu=iku,Inuktitut
+ja=jpn,Japanese
+jv=jw,jav,jaw,Javanese
+ka=geo,kat,Georgian
+kg=kon
+ki=kik
+kj=kua
+kk=kaz,Kazakh
+kl=kal,Greenlandic
+km=khm,Khmer
+kn=kan,Kannada
+ko=kor,Korean
+kr=kau
+ks=kas,Kashmiri
+ku=kur,Kurdish
+kv=kom
+kw=cor
+ky=kir,Kirghiz
+la=lat,Latin
+lb=ltz
+lg=lug
+li=lim
+ln=lin,Lingala
+lo=lao,Lao
+lt=lit,Lithuanian
+lu=lub
+lv=lav,Latvian
+mg=mlg,Malagasy
+mh=mah
+mi=mao,mri,Maori
+mk=mac,mak,Macedonian
+ml=mal,mlt,Maltese
+mn=mon,Mongolian
+mo=mol,Moldavian
+mr=mar,Marathi
+ms=may,msa,Malay
+mt=mlt
+my=bur,mya,Burmese
+na=nau,Nauru
+nb=nob
+nd=nde
+ne=nep,Nepali
+ng=ndo
+nl=dut,nla,Dutch
+nn=nno
+no=nor,Norwegian
+nr=nbl
+nv=nav
+ny=nya
+oc=oci,Langue d'Oc
+oj=oji
+om=orm,Oromo
+or=ori,Oriya
+os=oss
+pa=pan,Panjabi
+pi=pli
+pl=pol,Polish
+ps=pus,Pushto
+pt=por,Portuguese
+qu=que,Quechua
+rm=roh,Rhaeto-Romance
+rn=run,Rundi
+ro=ron,rum,Romanian
+ru=rus,Russian
+rw=kin,Kinyarwanda
+sa=san,Sanskrit
+sc=srd
+sd=snd,Sindhi
+se=sme
+sg=sag,Sango
+sh=scr,Serbo-Croatian
+si=sin,Singhalese
+sk=slk,slo,Slovak
+sl=slv,Slovenian
+sm=smo,Samoan
+sn=sna,Shona
+so=som,Somali
+sq=alb,sqi,Albanian
+sr=scc,srp,Serbian
+ss=ssw,Siswant
+st=sot,Sotho
+su=sun,Sudanese
+sv=sve,swe,Swedish,Svenska,Sweden
+sw=swa,Swahili
+ta=tam,Tamil
+te=tel,Telugu
+tg=tgk,Tajik
+th=tha,Thai
+ti=tir,Tigrinya
+tk=tuk,Turkmen
+tl=tgl,Tagalog
+tn=tsn,Tswana
+to=tog,Tonga
+tr=tur,Turkish
+ts=tso,Tsonga
+tt=tat,Tatar
+tw=twi,Twi
+ty=tah
+ug=uig,Uighur
+uk=ukr,Ukrainian
+ur=urd,Urdu
+uz=uzb,Uzbek
+ve=ven
+vi=vie,Vietnamese
+vo=vol,Volapk
+wa=wln
+wo=wol,Wolof
+xh=xho,Xhosa
+yi=yid,Yiddish
+yo=yor,Yoruba
+za=zha,Zhuang
+zh=chi,zho,Chinese
+zu=zul,Zulu

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/package.html b/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/package.html
new file mode 100644
index 0000000..06343c8
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/main/java/org/apache/nutch/analysis/lang/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>Text document language identifier.</p><p>Language profiles are based on material from
+<a href="http://www.homepages.inf.ed.ac.uk/pkoehn/publications/europarl.ps/">http://www.homepages.inf.ed.ac.uk/pkoehn/publications/europarl.ps/</a>.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
new file mode 100644
index 0000000..8245151
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
@@ -0,0 +1,149 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.analysis.lang;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.language.LanguageIdentifier;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestHTMLLanguageParser {
+
+  private static String URL = "http://foo.bar/";
+
+  private static String BASE = "http://foo.bar/";
+
+  // One document per language hint: html lang attribute, http-equiv
+  // content-language meta, and dc.language meta.
+  String docs[] = {
+      "<html lang=\"fi\"><head>document 1 title</head><body>jotain suomeksi</body></html>",
+      "<html><head><meta http-equiv=\"content-language\" content=\"en\"><title>document 2 title</head><body>this is english</body></html>",
+      "<html><head><meta name=\"dc.language\" content=\"en\"><title>document 3 title</head><body>this is english</body></html>" };
+
+  // Tika does not return "fi" but null
+  // Expected Metadata.LANGUAGE value for each document in docs[], in order.
+  String metalanguages[] = { "fi", "en", "en" };
+
+  /**
+   * Test parsing of language identifiers from html
+   **/
+  @Test
+  public void testMetaHTMLParsing() {
+
+    try {
+      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
+      /* loop through the test documents and validate result */
+      for (int t = 0; t < docs.length; t++) {
+        Content content = getContent(docs[t]);
+        Parse parse = parser.parse(content).get(content.getUrl());
+        Assert.assertEquals(metalanguages[t], (String) parse.getData()
+            .getParseMeta().get(Metadata.LANGUAGE));
+      }
+    } catch (Exception e) {
+      e.printStackTrace(System.out);
+      Assert.fail(e.toString());
+    }
+
+  }
+
+  /** Test of <code>LanguageParser.parseLanguage(String)</code> method. */
+  @Test
+  public void testParseLanguage() {
+    // Pairs of { raw language string, expected ISO 639-1 primary code }.
+    String tests[][] = { { "(SCHEME=ISO.639-1) sv", "sv" },
+        { "(SCHEME=RFC1766) sv-FI", "sv" }, { "(SCHEME=Z39.53) SWE", "sv" },
+        { "EN_US, SV, EN, EN_UK", "en" }, { "English Swedish", "en" },
+        { "English, swedish", "en" }, { "English,Swedish", "en" },
+        { "Other (Svenska)", "sv" }, { "SE", "se" }, { "SV", "sv" },
+        { "SV charset=iso-8859-1", "sv" }, { "SV-FI", "sv" },
+        { "SV; charset=iso-8859-1", "sv" }, { "SVE", "sv" }, { "SW", "sw" },
+        { "SWE", "sv" }, { "SWEDISH", "sv" }, { "Sv", "sv" }, { "Sve", "sv" },
+        { "Svenska", "sv" }, { "Swedish", "sv" }, { "Swedish, svenska", "sv" },
+        { "en, sv", "en" }, { "sv", "sv" },
+        { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" }, { "sv,en", "sv" },
+        { "sv-FI", "sv" }, { "sv-SE", "sv" }, { "sv-en", "sv" },
+        { "sv-fi", "sv" }, { "sv-se", "sv" },
+        { "sv; Content-Language: sv", "sv" }, { "sv_SE", "sv" },
+        { "sve", "sv" }, { "svenska, swedish, engelska, english", "sv" },
+        { "sw", "sw" }, { "swe", "sv" }, { "swe.SPR.", "sv" },
+        { "sweden", "sv" }, { "swedish", "sv" }, { "swedish,", "sv" },
+        { "text/html; charset=sv-SE", "sv" }, { "text/html; sv", "sv" },
+        { "torp, stuga, uthyres, bed & breakfast", null } };
+
+    // NOTE(review): the bound 44 is hard-coded; prefer tests.length so newly
+    // added cases cannot be silently skipped.
+    for (int i = 0; i < 44; i++) {
+      Assert.assertEquals(tests[i][1],
+          HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
+    }
+  }
+
+  /** Wrap the given markup in a text/html {@link Content} object. */
+  private Content getContent(String text) {
+    Metadata meta = new Metadata();
+    meta.add("Content-Type", "text/html");
+    // NOTE(review): text.getBytes() uses the platform charset; the test
+    // markup is ASCII-only, so this is benign here.
+    return new Content(URL, BASE, text.getBytes(), "text/html", meta,
+        NutchConfiguration.create());
+  }
+
+  // NOTE(review): method name carries a typo ("Indentifier"); kept as-is.
+  // Runs Tika's LanguageIdentifier over every corpus file listed in
+  // test-referencial.txt (format: "<file>;<expected lang>") and checks both
+  // per-line (for lines longer than 256 chars) and whole-file identification.
+  @Test
+  public void testLanguageIndentifier() {
+    try {
+      long total = 0;
+      LanguageIdentifier identifier;
+      BufferedReader in = new BufferedReader(new InputStreamReader(this
+          .getClass().getResourceAsStream("test-referencial.txt")));
+      String line = null;
+      while ((line = in.readLine()) != null) {
+        String[] tokens = line.split(";");
+        if (!tokens[0].equals("")) {
+          StringBuilder content = new StringBuilder();
+          // Test each line of the file...
+          BufferedReader testFile = new BufferedReader(new InputStreamReader(
+              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
+          String testLine = null, lang = null;
+          while ((testLine = testFile.readLine()) != null) {
+            content.append(testLine + "\n");
+            testLine = testLine.trim();
+            // Only lines with enough text give the identifier a fair chance.
+            if (testLine.length() > 256) {
+              identifier = new LanguageIdentifier(testLine);
+              lang = identifier.getLanguage();
+              Assert.assertEquals(tokens[1], lang);
+            }
+          }
+          testFile.close();
+
+          // Test the whole file
+          long start = System.currentTimeMillis();
+          System.out.println(content.toString());
+          identifier = new LanguageIdentifier(content.toString());
+          lang = identifier.getLanguage();
+          System.out.println(lang);
+          total += System.currentTimeMillis() - start;
+          Assert.assertEquals(tokens[1], lang);
+        }
+      }
+      in.close();
+      System.out.println("Total Time=" + total);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/da.test
----------------------------------------------------------------------
diff --git a/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/da.test b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/da.test
new file mode 100644
index 0000000..1238cd5
--- /dev/null
+++ b/nutch-plugins/language-identifier/src/test/java/org/apache/nutch/analysis/lang/da.test
@@ -0,0 +1,108 @@
+Genoptagelse af sessionen
+Jeg erklærer Europa-Parlamentets session, der blev afbrudt fredag den 17. december, for genoptaget. Endnu en gang vil jeg ønske Dem godt nytår, og jeg håber, De har haft en god ferie.
+Som De kan se, indfandt det store "�r 2000-problem" sig ikke. Til geng�ld har borgerne i en del af medlemslandene v�ret ramt af meget forf�rdelige naturkatastrofer. De har udtrykt �nske om en debat om dette emne i l�bet af m�deperioden. I mellemtiden �nsker jeg - som ogs� en del kolleger har anmodet om - at vi iagttager et minuts stilhed til minde om ofrene for bl.a. stormene i de medlemslande, der blev ramt. Jeg opfordrer Dem til st�ende at iagttage et minuts stilhed.
+(Parlamentet iagttog st�ende et minuts stilhed
+
+Fru formand, en bem�rkning til forretningsordenen. Gennem pressen og tv vil De v�re bekendt med en r�kke bombeeksplosioner og drab i Sri Lanka. En af de personer, der blev myrdet for ganske nylig i Sri Lanka, var hr. Kumar Ponnambalam, der bes�gte Europa-Parlamentet for f� m�neder siden. Ville det v�re passende, hvis De, fru formand, sendte en skrivelse til Sri Lankas pr�sident for at udtrykke vores dybe beklagelse i forbindelse med Kumar Ponnambalams d�d og de andre voldsomme d�dsfald i Sri Lanka og for indtr�ngende at anmode pr�sidenten om at g�re alt for at opn� en fredelig l�sning p� en meget vanskelig situation?
+
+Ja, hr. Evans, jeg mener, at et initiativ, som det, De foreslår, ville være meget hensigtsmæssigt. Hvis Europa-Parlamentet er enigt, vil jeg gøre, som hr. Evans har foreslået.
+
+Fru formand, en bem�rkning til forretningsordenen. Jeg vil gerne have Deres r�d om artikel 143 vedr�rende afvisning. Mit sp�rgsm�l omhandler et emne, der vil blive behandlet p� torsdag, og jeg vil gerne tage emnet op igen ved den lejlighed.
+Bet�nkningen af Cunha om fler�rige udviklingsprogrammer skal forhandles af Parlamentet p� torsdag og indeholder et forslag i punkt 6 om, at der skal indf�res kvotesanktioner for lande, der ikke overholder deres �rlige m�ls�tninger for fl�dereduktion. Dette skal i henhold til punkt 6 indf�res til trods for princippet om relativ stabilitet. Jeg mener, at princippet om relativ stabilitet er et grundl�ggende retsprincip for den f�lles fiskeripolitik, og at der vil v�re juridisk bel�g for at afvise et forslag om at undergrave dette princip. Jeg vil gerne vide, om man kan g�re indsigelse mod noget, der bare er en bet�nkning og ikke et forslag til retsakt, og om det er noget, jeg kan g�re p� torsdag?
+
+Det er netop dér, De - hvis De ønsker det - kan rejse dette spørgsmål, det vil sige på torsdag ved forhandlingens begyndelse.
+
+Fru formand, samtidig med Europa-Parlamentets f�rste m�deperiode i �r har man i Texas i USA fastsat datoen for henrettelsen af en d�dsd�mt, nemlig en ung mand p� 34 �r ved navn Hicks, og det er desv�rre p� n�ste torsdag.
+P� anmodning af et fransk parlamentsmedlem, hr. Zimeray, er der allerede indgivet et andragende, som mange har skrevet under p�, heriblandt undertegnede, men i tr�d med den holdning, som Europa-Parlamentet og hele Det Europ�iske F�llesskab konstant giver udtryk for, anmoder jeg Dem om at g�re den indflydelse, De har i kraft af Deres embede og den institution, De repr�senterer, g�ldende over for pr�sidenten og Texas' guvern�r Bush, som har bef�jelse til at oph�ve d�dsdommen og ben�de den d�mte.
+Alt dette er i tr�d med de principper, vi altid har v�ret tilh�ngere af.
+
+Tak, hr. Segni, det g�r jeg med gl�de. Det er s�ledes helt i tr�d med den holdning, Europa-Parlamentet altid har indtaget.
+
+Fru formand, jeg vil gerne g�re Dem opm�rksom p� en sag, som Parlamentet har besk�ftiget sig med gentagne gange. Det drejer sig om Alexander Nikitin. Vi gl�der os alle sammen over, at domstolen har frifundet ham og understreget, at adgangen til milj�informationer ogs� er konstitutionel ret i Rusland. Men nu er det s�dan, at han skal anklages igen, fordi statsadvokaten har anket dommen. Vi ved og har fastsl�et i virkelig mange beslutninger - netop p� det sidste m�de sidste �r - at dette ikke bare er en juridisk sag, og at det er forkert at beskylde Alexander Nikitin for at have beg�et kriminalitet og forr�deri, fordi vi som ber�rte nyder godt af hans resultater. Disse resultater er grundlaget for de europ�iske programmer til beskyttelse af Barentsee, og derfor beder jeg Dem gennemg� et brevudkast, som beskriver de vigtigste fakta, og tydeligg�re denne holdning i Rusland i overensstemmelse med Parlamentets beslutninger.
+
+Ja, fru Schroedter, jeg skal med gl�de unders�ge dette sp�rgsm�l, n�r jeg har modtaget Deres brev.
+
+Fru formand, jeg vil gerne f�rst give Dem en kompliment for den kendsgerning, at De har holdt Deres ord, og at antallet af tv-kanaler p� vores kontorer faktisk er udvidet enormt nu i denne f�rste m�deperiode i det nye �r. Men, fru formand, det, som jeg havde anmodet om, er ikke sket. Der er nu ganske vist to finske kanaler og en portugisisk kanal, men der er stadig ingen nederlandsk kanal, og jeg havde anmodet Dem om en nederlandsk kanal, fordi ogs� nederl�ndere gerne vil f�lge med i nyhederne hver m�ned, n�r vi forvises til dette sted. Jeg vil s�ledes endnu en gang anmode Dem om alligevel at s�rge for, at vi ogs� f�r en nederlandsk kanal.
+
+Fru Plooij-van Gorsel, jeg kan oplyse Dem om, at dette sp�rgsm�l er opf�rt p� dagsordenen for kv�storernes m�de p� onsdag. Det vil, h�ber jeg, blive behandlet i en positiv �nd.
+
+Fru formand, kan De fort�lle mig, hvorfor Parlamentet ikke overholder de lovgivningsbestemmelser om sundhed og sikkerhed, som det selv har fastsat? Hvorfor er der ikke foretaget en unders�gelse af luftkvaliteten i denne bygning, siden vi blev valgt? Hvorfor har Sundheds- og Sikkerhedsudvalget ikke haft et m�de siden 1998? Hvorfor har der ikke v�ret brand�velser, hverken i parlamentsbygningerne i Bruxelles eller Strasbourg? Hvorfor er der ingen brandinstrukser? Hvorfor etableres der ikke omr�der med rygeforbud? Det er fuldst�ndig skandal�st, at vi fasts�tter lovgivningsbestemmelser og s� ikke overholder dem selv.
+
+Fru Lynne, De har fuldst�ndig ret, og jeg vil kontrollere, om alle disse ting virkelig ikke er blevet gjort. Jeg vil ligeledes freml�gge problemet for kv�storerne, og jeg er sikker p�, at kv�storerne vil bestr�be sig p� at s�rge for, at vi overholder den lovgivning, vi vedtager.
+
+Fru formand, fru D�ez Gonz�lez og jeg havde stillet nogle sp�rgsm�l om visse holdninger gengivet i en spansk avis, som n�stformanden, fru de Palacio, har givet udtryk for. De kompetente tjenestegrene har ikke opf�rt dem p� dagsordenen, fordi de mener, at de blev besvaret ved et tidligere m�de.
+Jeg anmoder om, at denne beslutning tages op til fornyet overvejelse, for det er ikke tilf�ldet. De sp�rgsm�l, der tidligere er blevet besvaret, drejede sig om fru de Palacios medvirken i en bestemt sag og ikke om de erkl�ringer, som kunne l�ses i avisen ABC den 18. november sidste �r.
+
+K�re kolleger, vi vil unders�ge alt dette. Jeg m� indr�mme, at det hele forekommer mig lidt forvirrende i �jeblikket. Derfor vil vi unders�ge det meget omhyggeligt, s�ledes at alt er, som det skal v�re.
+
+Fru formand, jeg vil gerne vide, om der kommer en klar melding fra Parlamentet i denne uge om vores utilfredshed i forbindelse med dagens beslutning om ikke at forl�nge embargoen mod v�beneksport til Indonesien i betragtning af, at et stort flertal i Parlamentet tidligere har undertegnet v�benembargoen i Indonesien. Dagens beslutning om ikke at forl�nge embargoen er meget farlig p� grund af situationen der. Parlamentet b�r derfor tilkendegive sin holdning, da det er flertallets �nske. Det er uansvarligt af EU-medlemsstater at n�gte at forl�nge embargoen. Som n�vnt tidligere er der tale om en meget ustabil situation. Der er endog fare for et milit�rkup i fremtiden. Vi ved ikke, hvad der sker. S� hvorfor skal v�benproducenter i EU profitere p� bekostning af uskyldige mennesker?
+
+Under alle omst�ndigheder er punktet ikke p� nuv�rende tidspunkt opf�rt under forhandlingen om aktuelle og uops�ttelige sp�rgsm�l p� torsdag.
+
+Arbejdsplan
+N�ste punkt p� dagsordenen er fasts�ttelse af arbejdsplanen.
+Det endelige forslag til dagsorden, som det blev opstillet af Formandskonferencen p� m�det torsdag den 13. januar i overensstemmelse med forretningsordenens artikel 95, er omdelt.
+Det foreligger ingen forslag til �ndring for mandag og tirsdag.
+Onsdag:
+PSE-gruppen anmoder om at f� en redeg�relse fra Kommissionen om dens strategiske m�l for de kommende fem �r samt om den administrative reform opf�rt p� dagsordenen.
+Hvis hr. Bar�n Crespo, der har fremsat anmodningen, �nsker det, opfordrer jeg ham til at begrunde sit forslag. Dern�st g�r vi, som vi plejer, det vil sige, at vi h�rer et indl�g for og et indl�g imod forslaget.
+
+Fru formand, forel�ggelsen af Prodi-Kommissionens politiske program for hele valgperioden var til at begynde med et forslag fra De Europ�iske Socialdemokraters Gruppe, som opn�ede enstemmighed p� Formandskonferencen i september og ogs� hr. Prodis udtrykkelige accept, og han gentog sit l�fte i sin inds�ttelsestale.
+Dette l�fte er vigtigt, fordi Kommissionen er et organ, der har initiativmonopol i henhold til traktaterne og derfor grundl�ggende udformer Parlamentets politiske arbejde og lovgivningsarbejde i de kommende fem �r. Jeg vil ogs� minde om, fru formand, at Parlamentet to gange i foreg�ende valgperiode ved afstemning gav udtryk for sin tillid til formand Prodi. I denne valgperiode igen i juli og senere, med den nye Kommission p� plads, gav det igen i september hele Kommissionen et tillidsvotum. Der har derfor v�ret tid nok til, at Kommissionen kunne forberede sit program, og til at vi kunne f� kendskab til det og forklare det til borgerne. I den forbindelse vil jeg minde om beslutningen fra 15. september, hvori der blev henstillet til, at forslaget blev forelagt hurtigst muligt.
+Det, der skete i sidste uge - og som opstod uden for Formandskonferencen, hvor den udelukkende blev brugt til at bekr�fte og godkende beslutninger, som var truffet uden for den - skaber et dilemma: Enten er Kommissionen ikke i stand til at freml�gge det program. (I s� fald ville det v�re passende, at den informerede om det. If�lge kommissionsformandens udsagn er de i stand til at g�re det. Eftersom Kommissionen er repr�senteret af n�stformanden, fru de Palacio, mener jeg, at det f�r afstemningen ville v�re p� sin plads at v�re p� det rene med Kommissionens situation, hvad ang�r dets vilje til at forel�gge programmet, ligesom det var blevet aftalt.) Eller ogs� er Parlamentet ikke i stand til at behandle dette program, som der vist er nogle, der p�st�r. Efter min mening ville denne anden hypotese v�re det samme som at give afkald p� vores ansvar som parlament og desuden at indf�re en original teori, en ukendt metode, der best�r i skriftligt at give de politiske g
 rupper kendskab til Kommissionens program en uge f�r - og ikke dagen f�r, som det var aftalen - i betragtning af, at lovgivningsprogrammet skal diskuteres i februar, s�ledes at vi kunne springe forhandlingen over, fordi pressen og Internettet dagen efter havde givet alle borgerne kendskab til det, og Parlamentet ville ikke l�ngere beh�ve at bekymre sig om sagen.
+Da min gruppe mener, at et parlament er til for at lytte, diskutere og overveje, mener vi, at der ikke er noget som helst, der kan retf�rdigg�re denne uds�ttelse, og vi mener, at hvis Kommissionen er i stand til at g�re det, er der tid nok til, at vi kan genetablere den oprindelige aftale mellem Parlamentet og Kommissionen og handle ansvarligt over for vores medborgere. Derfor g�r det forslag, som De Europ�iske Socialdemokraters Gruppe stiller, og som De har n�vnt, ud p�, at vi holder fast ved forel�ggelsen af Prodi-Kommissionens program for valgperioden p� onsdag, og at dette program ogs� omfatter forslaget til administrativ reform, for hvis det ikke bliver s�dan, kan vi komme i en paradoksal situation: Med en undskyldning om at der ikke er en tekst, n�gtes formanden for Kommissionen p� den ene side retten til at tale i Parlamentet, og p� den anden side forhindres det, at der finder en forhandling sted om reformen, uden at Parlamentet p� forh�nd kender de tekster, 
 som den er baseret p�. Derfor, fru formand, anmoder jeg Dem om at bede Kommissionen om at udtale sig nu, og at vi derefter g�r over til afstemning.
+(Bifald fra PSE-gruppen)
+
+Fru formand, k�re kolleger, jeg er godt nok noget forbavset over vores kollega Bar�n Crespos opf�rsel. Han forlanger nu, at dette punkt s�ttes p� dagsordenen for onsdag.
+Hr. Bar�n Crespo, De kunne ikke deltage den sidste torsdag p� Formandskonferencen. Det kritiserer jeg ikke, for det sker af og til, at man lader sig repr�sentere. Hr. H�nsch repr�senterede Dem d�r. Vi havde en udf�rlig debat p� Formandskonferencen. Kun Deres gruppe repr�senterede det, som De siger nu. Vi stemte derefter om det. Hver ordf�rer har jo lige s� mange stemmer, som der er medlemmer i gruppen. Der var en afstemning om dette punkt. S� vidt jeg husker, faldt denne afstemning s�ledes ud: 422 mod 180 stemmer og nogle f�, der undlod at stemme. Det vil sige, at alle grupper med undtagelse af l�sg�ngerne - men de udg�r jo ikke nogen gruppe - var enige, kun Deres gruppe mente, at man skulle b�re sig s�dan ad, som De har foresl�et her. Alle andre mente noget andet. Det var beslutningen.
+Nu vil jeg gerne sige noget til selve sagen. Vi har tillid til Kommissionen, til Romano Prodi, og flertallet i vores gruppe har udtrykt tillid til Romano Prodi og Kommissionen efter en vanskelig proces, som alle kender til. Men vi mener ogs�, at vi skal have en debat om Kommissionens strategi i en ordin�r procedure, ikke kun p� baggrund af en mundtlig forklaring her i Europa-Parlamentet, men ogs� p� baggrund af et dokument, som er blevet besluttet i Kommissionen, og som beskriver dette program for fem �r. Et s�dant dokument findes ikke!
+
+Kommissionen vil freml�gge programmet for �r 2000 til februar. Vi har sagt, at hvis Kommissionen ikke �nsker at lave programmet for �r 2000 i januar, s� g�r vi det i februar. Det har vi godkendt. Vi �nsker s�dan set ikke nogen konflikt med Kommissionen, vi mener derimod, at hvis det g�r, skal Kommissionen og Parlamentet g� samme vej. Men Parlamentet er ogs� Kommissionens kontroll�r. Og ikke alt, hvad der kommer fra Kommissionen, skal n�dvendigvis v�re i overensstemmelse med os.
+Jeg vil gerne have, at vi f�r mulighed for at forberede os fornuftigt p� en debat om fem�rsprogrammet i grupperne. Man kan ikke forberede sig, hvis man h�rer en forklaring her og slet ikke ved, hvad indholdet af en s�dan forklaring er. Derfor anbefaler vi - og det er mit indtryk, at Kommissionen ogs� er �ben over for denne tanke - at vi f�rer debatten om Kommissionens langsigtede program frem til �r 2005 i februar - jeg h�ber ogs�, at Kommissionen er blevet enig om et program til den tid, som den vil foresl� os - og at vi samtidig f�rer en debat om Kommissionens lovgivningsprogram for �r 2000 i februar. Det er s�ledes ogs� en fornuftig saglig sammenh�ng, som r�der os til at f�re debatten om begge programmer i f�llesskab. Derfor afviser min gruppe p� det bestemteste Den Socialdemokratiske Gruppes forslag!
+(Bifald fra PPE-DE-gruppen)
+
+Fru formand, jeg vil g�re det meget klart, at Kommissionen f�rst og fremmest har den st�rste respekt for Parlamentets beslutninger, deriblandt opstillingen af dagsordenen. Derfor respekterer vi Parlamentets beslutning, hvad det ang�r.
+Men jeg vil ogs� g�re det meget klart, at hr. Prodi aftalte med Parlamentet at indf�re en ny forhandling, som hr. Bar�n nok husker, ud over den �rlige forhandling om Kommissionens lovgivningsprogram, om hovedlinjerne i aktionerne for den kommende fem�rsperiode, alts� for denne valgperiode.
+Jeg vil sige, fru formand, at denne forhandling i den aftale, som blev indg�et i september, adskilte sig fra Kommissionens �rlige forel�ggelse af programmet for lovgivningen. Og jeg vil sige, fru formand, at vi i Kommissionen er forberedt p� og rede til at deltage i den forhandling, n�r det er belejligt, at vi var rede til at gennemf�re den i denne uge, som det var aftalt fra begyndelsen, med udgangspunkt i at den blev forelagt dagen f�r i en tale til de parlamentariske grupper.
+Jeg vil derfor gentage, fru formand, at vi for vores del har diskuteret handlingsprogrammet for de kommende fem �r, og at vi er rede til, n�r Parlamentet bestemmer det - i denne uge, hvis det er beslutningen - at komme og forel�gge programmet for de kommende fem �r og i n�ste m�ned programmet for 2000, hvilket er helt i overensstemmelse med aftalen.
+
+Jeg foresl�r, at vi stemmer om PSE-gruppens anmodning om at f� en redeg�relse fra Kommissionen om dens strategiske m�l genopf�rt p� dagsordenen.
+(Forslaget forkastedes) Formanden. Stadig med hensyn til dagsordenen for onsdag har jeg et forslag om de mundtlige foresp�rgsler om kapitalskat. PPE-DE-gruppen �nsker, at dette punkt tages af dagsordenen.
+�nsker nogen at tage ordet p� vegne af gruppen for at begrunde denne anmodning?
+
+Fru formand, da jeg kan h�re en smule latter fra Socialdemokraterne - jeg har f�et fortalt, at brede kredse i Den Socialdemokratiske Gruppe ogs� gerne vil have taget dette punkt af dagsordenen, fordi der ved afstemningen p� Formandskonferencen ikke forel� noget votum fra arbejdsgruppen af ansvarlige kolleger i Den Socialdemokratiske Gruppe. Jeg ved ikke, om denne oplysning er rigtig, men PPE-DE-gruppen ville i hvert fald v�re taknemmelig, hvis dette punkt blev annulleret, fordi Parlamentet allerede har besk�ftiget sig med dette sp�rgsm�l flere gange. Der er ogs� truffet beslutninger mod en s�dan skat. Derfor anmoder min gruppe om, at dette punkt tages af dagsordenen.
+
+Tak, hr. Poettering.
+Vi skal nu h�re hr. Wurtz, der er imod forslaget.
+
+Fru formand, jeg vil f�rst og fremmest fremh�ve hr. Poetterings manglende konsekvens. For et �jeblik siden bel�rte han socialdemokraterne, fordi de ville �ndre en klar beslutning truffet p� Formandskonferencen. Imidlertid g�r han det samme. Vi havde en diskussion, vi var alle - p� n�r PPE-DE-gruppen og Den Liberale Gruppe - enige, og jeg bem�rkede endda - som De sikkert husker, k�re medform�nd - at det ikke drejede sig om, hvorvidt De er for eller imod Tobin-afgiften, men om De turde h�re, hvad Kommissionen og R�det mente om den. Dette er ikke for meget forlangt. Derfor fastholder jeg forslaget om at bevare det mundtlige sp�rgsm�l til Kommissionen og R�det, s�ledes at vi �n gang for alle f�r opklaret, hvilken holdning de to institutioner har til dette ret beskedne forslag, som dog sender et vigtigt signal til befolkningen, navnlig efter fiaskoen i Seattle.
+
+Vi skal stemme om PPE-DE-gruppens anmodning om, at de mundtlige foresp�rgsler om kapitalskat tages af dagsordenen.
+(Forslaget forkastedes. 164 stemte for, 166 stemte imod, og 7 undlod at stemme)
+
+Fru formand, jeg vil gerne takke hr. Poettering for den reklame, han netop har gjort for denne debat. Tak.
+
+Fru formand, er min stemme, som jeg ikke kunne afgive elektronisk, fordi jeg ikke har kortet, blevet talt med? Jeg stemte for.
+
+Det er rigtigt. Hvis vi tilf�jer de to kolleger, der har givet sig til kende, bliver resultatet ...
+
+Fru formand, formandskabet har bekendtgjort afstemningens udfald. Det kan der ikke laves om p�.
+
+K�re kolleger, jeg minder endnu en gang om, at det er vigtigt, at alle har deres kort om mandagen. Det er tydeligt, at vi har et problem, og jeg m� derfor tr�ffe en beslutning.
+Jeg har ogs� glemt mit kort, og jeg ville have stemt imod. Derfor mener jeg, at det mundtlige sp�rgsm�l fortsat skal medtages p� dagsordenen.
+Det er sidste gang, vi vil tage hensyn til glemte kort. Lad dette v�re helt klart, og husk det.
+(Bifald)
+Ja, det mundtlige sp�rgsm�l er fortsat opf�rt p� dagsordenen, og ja, formanden har ret til at stemme, ligesom hun har ret til at glemme sit kort.
+Vi forts�tter nu med de �vrige �ndringer af dagsordenen.
+
+Fru formand, i den tidligere afstemning - og jeg vil rette mig efter Deres afg�relse om dette emne - om sp�rgsm�let om Kommissionens redeg�relse om dens strategiske m�l gav jeg udtryk for, at jeg gerne ville tale p� vegne af min gruppe f�r afstemningen. Det blev ikke til noget. Jeg vil s�tte pris p� at f� lejlighed til at afgive stemmeforklaring p� vegne af min gruppe i forbindelse med afslutningen af dette sp�rgsm�l. Det er et vigtigt sp�rgsm�l, og det vil v�re nyttigt for Parlamentet, hvis det er angivet, hvordan de forskellige personer opfatter vores handlinger i lyset af deres egne politiske analyser.
+
+Fru formand, jeg vil ikke genoptage debatten, men jeg havde ogs� meldt mig for at tage stilling til hr. Bar�n Crespos �ndringsforslag. De r�bte mig heller ikke op. Det beklager jeg, men afstemningen er gennemf�rt, afg�relsen er truffet, vi lader det alts� ligge.
+
+Jeg beklager, hr. H�nsch og hr. Cox, jeg s� ikke, at De anmodede om ordet. Men i �vrigt mener jeg, at holdningerne er meget klare, og de vil blive indf�rt i protokollen. N�r vi i morgen skal vedtage protokollen for i dag, kan de kolleger, der ikke synes, at holdningerne er blevet tilstr�kkeligt forklaret, anmode om �ndringer. Det, mener jeg, er en god l�sning. Selvf�lgelig vil protokollen for m�det i morgen tage hensyn til alle de supplerende forklaringer. Jeg mener, at det er en bedre l�sning end at g� over til stemmeforklaringer p� nuv�rende tidspunkt, som ville v�re et stort sidespring. Hr. Cox og hr. H�nsch, passer denne l�sning Dem?
+
+Fru formand, hvis protokollen giver korrekt udtryk for min gruppes holdning i forbindelse med afstemningen, vil og kan jeg ikke g�re indsigelser. Hvis De afg�r, at der ikke er grund til at afgive stemmeforklaring, vil jeg acceptere det, men med forbehold.
+
+Vi vil derfor v�re meget opm�rksomme p� udarbejdelsen af protokollen. Det er vi i �vrigt altid. Hvis holdningerne ikke klart fremg�r, kan vi eventuelt �ndre den.
+(Den s�ledes �ndrede dagsorden godkendtes)
+


[27/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
new file mode 100644
index 0000000..b4771d0
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
@@ -0,0 +1,49 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mortbay.jetty.HttpConnection;
+import org.mortbay.jetty.Request;
+import org.mortbay.jetty.handler.AbstractHandler;
+
+/**
+ * Base class for testbed proxy handlers. Tags every response with an
+ * "X-TestbedHandlers" header naming the handler class, then delegates to the
+ * Jetty-specific {@link #handle(Request, HttpServletResponse, String, int)}.
+ */
+public abstract class AbstractTestbedHandler extends AbstractHandler {
+  protected boolean debug = false;
+
+  @Override
+  public void handle(String target, HttpServletRequest req,
+      HttpServletResponse res, int dispatch) throws IOException,
+      ServletException {
+    // Unwrap the concrete Jetty Request so subclasses get the richer API.
+    final Request baseRequest;
+    if (req instanceof Request) {
+      baseRequest = (Request) req;
+    } else {
+      baseRequest = HttpConnection.getCurrentConnection().getRequest();
+    }
+    res.addHeader("X-TestbedHandlers", this.getClass().getSimpleName());
+    handle(baseRequest, res, target, dispatch);
+  }
+
+  /**
+   * Jetty-typed handling hook implemented by concrete testbed handlers.
+   */
+  public abstract void handle(Request req, HttpServletResponse res,
+      String target, int dispatch) throws IOException, ServletException;
+
+  /**
+   * Adds a diagnostic response header namespaced by the handler class,
+   * i.e. "X-&lt;SimpleClassName&gt;-&lt;name&gt;".
+   */
+  public void addMyHeader(HttpServletResponse res, String name, String value) {
+    res.addHeader("X-" + this.getClass().getSimpleName() + "-" + name, value);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/DelayHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/DelayHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/DelayHandler.java
new file mode 100644
index 0000000..58f1f43
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/DelayHandler.java
@@ -0,0 +1,56 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Random;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mortbay.jetty.Request;
+
+/**
+ * Testbed handler that delays each response to simulate a slow server.
+ * A non-negative constructor argument yields a fixed delay; a negative one
+ * enables random delays drawn uniformly from [0, -delay).
+ */
+public class DelayHandler extends AbstractTestbedHandler {
+
+  /** Suggested default delay in milliseconds (not applied automatically). */
+  public static final long DEFAULT_DELAY = 2000;
+
+  private int delay;      // fixed delay, or exclusive upper bound when random
+  private boolean random; // true when delays are randomized per request
+  private Random r;       // seeded so test runs are repeatable
+
+  /**
+   * @param delay delay in milliseconds; pass a negative value to get random
+   *              delays uniformly distributed in [0, -delay)
+   */
+  public DelayHandler(int delay) {
+    if (delay < 0) {
+      delay = -delay;
+      random = true;
+      r = new Random(1234567890L); // repeatable random
+    }
+    this.delay = delay;
+  }
+
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    int del = random ? r.nextInt(delay) : delay;
+    try {
+      Thread.sleep(del);
+      addMyHeader(res, "Delay", String.valueOf(del));
+    } catch (InterruptedException e) {
+      // Previously all exceptions were silently swallowed here, losing the
+      // interrupt. Restore the thread's interrupt status so callers (e.g. a
+      // shutting-down Jetty worker) can observe the interruption.
+      Thread.currentThread().interrupt();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/FakeHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/FakeHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/FakeHandler.java
new file mode 100644
index 0000000..a40b199
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/FakeHandler.java
@@ -0,0 +1,102 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Random;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mortbay.jetty.HttpURI;
+import org.mortbay.jetty.Request;
+
+public class FakeHandler extends AbstractTestbedHandler {
+  Random r = new Random(1234567890L); // predictable
+
+  private static final String testA = "<html><body><h1>Internet Weather Forecast Accuracy</h1>\n"
+      + "<p>Weather forecasting is a secure and popular online presence, which is understandable. The weather affects most everyone's life, and the Internet can provide information on just about any location at any hour of the day or night. But how accurate is this information? How much can we trust it? Perhaps it is just my skeptical nature (or maybe the seeming unpredictability of nature), but I've never put much weight into weather forecasts - especially those made more than three days in advance. That skepticism progressed to a new high in the Summer of 2004, but I have only now done the research necessary to test the accuracy of online weather forecasts. First the story, then the data.</p>"
+      + "<h2>An Internet Weather Forecast Gone Terribly Awry</h2>"
+      + "<p>It was the Summer of 2004 and my wife and I were gearing up for a trip with another couple to Schlitterbahn in New Braunfels - one of the (if not the) best waterparks ever created. As a matter of course when embarking on a 2.5-hour drive to spend the day in a swimsuit, and given the tendency of the area for natural disasters, we checked the weather. The temperatures looked ideal and, most importantly, the chance of rain was a nice round goose egg.</p>";
+  private static final String testB = "<p>A couple of hours into our Schlitterbahn experience, we got on a bus to leave the 'old section' for the 'new section.' Along the way, clouds gathered and multiple claps of thunder sounded. 'So much for the 0% chance of rain,' I commented. By the time we got to our destination, lightning sightings had led to the slides and pools being evacuated and soon the rain began coming down in torrents - accompanied by voluminous lightning flashes. After at least a half an hour the downpour had subsided, but the lightning showed no sign of letting up, so we began heading back to our vehicles. A hundred yards into the parking lot, we passing a tree that had apparently been split in two during the storm (whether by lightning or wind, I'm not sure). Not but a few yards later, there was a distinct thud and the husband of the couple accompanying us cried out as a near racquetball sized hunk of ice rebounded off of his head and onto the concrete. Soon, simila
 rly sized hail was falling all around us as everyone scampered for cover. Some cowered under overturned trashcans while others were more fortunate and made it indoors.</p>"
+      + "<p>The hail, rain and lightning eventually subsided, but the most alarming news was waiting on cell phone voicemail. A friend who lived in the area had called frantically, knowing we were at the park, as the local news was reporting multiple people had been by struck by lightning at Schlitterbahn during the storm.</p>"
+      + "<p>'So much for the 0% chance of rain,' I repeated.</p></body></html>";
+
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    HttpURI u = req.getUri();
+    String uri = u.toString();
+    // System.err.println("-faking " + uri.toString());
+    addMyHeader(res, "URI", uri);
+    // don't pass it down the chain
+    req.setHandled(true);
+    res.addHeader("X-Handled-By", getClass().getSimpleName());
+    if (uri.endsWith("/robots.txt")) {
+      return;
+    }
+    res.setContentType("text/html");
+    try {
+      OutputStream os = res.getOutputStream();
+      byte[] bytes = testA.getBytes("UTF-8");
+      os.write(bytes);
+      // record URI
+      String p = "<p>URI: " + uri + "</p>\r\n";
+      os.write(p.getBytes());
+      // fake some links
+      String base;
+      if (u.getPath().length() > 5) {
+        base = u.getPath().substring(0, u.getPath().length() - 5);
+      } else {
+        base = u.getPath();
+      }
+      String prefix = u.getScheme() + "://" + u.getHost();
+      if (u.getPort() != 80 && u.getPort() != -1)
+        base += ":" + u.getPort();
+      if (!base.startsWith("/"))
+        prefix += "/";
+      prefix = prefix + base;
+      for (int i = 0; i < 10; i++) {
+        String link = "<p><a href='" + prefix;
+        if (!prefix.endsWith("/")) {
+          link += "/";
+        }
+        link += i + ".html'>outlink " + i + "</a></p>\r\n";
+        os.write(link.getBytes());
+      }
+      // fake a few links to random nonexistent hosts
+      for (int i = 0; i < 5; i++) {
+        int h = r.nextInt(1000000); // 1 mln hosts
+        String link = "<p><a href='http://www.fake-" + h + ".com/'>fake host "
+            + h + "</a></p>\r\n";
+        os.write(link.getBytes());
+      }
+      // fake a link to the root URL
+      String link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
+      if (u.getPort() != 80 && u.getPort() != -1)
+        link += ":" + u.getPort();
+      link += "/'>site " + u.getHost() + "</a></p>\r\n";
+      os.write(link.getBytes());
+      os.write(testB.getBytes());
+      res.flushBuffer();
+    } catch (IOException ioe) {
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/LogDebugHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/LogDebugHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/LogDebugHandler.java
new file mode 100644
index 0000000..2682f6d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/LogDebugHandler.java
@@ -0,0 +1,64 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import javax.servlet.Filter;
+import javax.servlet.FilterChain;
+import javax.servlet.FilterConfig;
+import javax.servlet.ServletException;
+import javax.servlet.ServletRequest;
+import javax.servlet.ServletResponse;
+import javax.servlet.http.HttpServletResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.mortbay.jetty.Request;
+
+public class LogDebugHandler extends AbstractTestbedHandler implements Filter {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(LogDebugHandler.class);
+
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    LOG.info("-- " + req.getMethod() + " " + req.getUri().toString() + "\n"
+        + req.getConnection().getRequestFields());
+  }
+
+  @Override
+  public void doFilter(ServletRequest req, ServletResponse res,
+      FilterChain chain) throws IOException, ServletException {
+    ((HttpServletResponse) res).addHeader("X-Handled-By", "AsyncProxyHandler");
+    ((HttpServletResponse) res).addHeader("X-TestbedHandlers",
+        "AsyncProxyHandler");
+    try {
+      chain.doFilter(req, res);
+    } catch (Throwable e) {
+      ((HttpServletResponse) res).sendError(HttpServletResponse.SC_BAD_REQUEST,
+          e.toString());
+    }
+  }
+
+  @Override
+  public void init(FilterConfig arg0) throws ServletException {
+    // TODO Auto-generated method stub
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/NotFoundHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/NotFoundHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/NotFoundHandler.java
new file mode 100644
index 0000000..ef439a6
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/NotFoundHandler.java
@@ -0,0 +1,40 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletResponse;
+
+import org.mortbay.jetty.Request;
+
+public class NotFoundHandler extends AbstractTestbedHandler {
+
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    // don't pass it down the chain
+    req.setHandled(true);
+    res.addHeader("X-Handled-By", getClass().getSimpleName());
+    addMyHeader(res, "URI", req.getUri().toString());
+    res.sendError(HttpServletResponse.SC_NOT_FOUND, "Not found: "
+        + req.getUri().toString());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/ProxyTestbed.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/ProxyTestbed.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/ProxyTestbed.java
new file mode 100644
index 0000000..a7e6aeb
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/ProxyTestbed.java
@@ -0,0 +1,156 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.mortbay.jetty.Handler;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.bio.SocketConnector;
+import org.mortbay.jetty.handler.HandlerList;
+import org.mortbay.jetty.servlet.ServletHandler;
+import org.mortbay.proxy.AsyncProxyServlet;
+
+public class ProxyTestbed {
+  private static final Logger LOG = LoggerFactory.getLogger(ProxyTestbed.class);
+
+  /**
+   * @param args
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length == 0) {
+      System.err
+          .println("TestbedProxy [-seg <segment_name> | -segdir <segments>] [-port <nnn>] [-forward] [-fake] [-delay nnn] [-debug]");
+      System.err
+          .println("-seg <segment_name>\tpath to a single segment (can be specified multiple times)");
+      System.err
+          .println("-segdir <segments>\tpath to a parent directory of multiple segments (as above)");
+      System.err
+          .println("-port <nnn>\trun the proxy on port <nnn> (special permissions may be needed for ports < 1024)");
+      System.err
+          .println("-forward\tif specified, requests to all unknown urls will be passed to");
+      System.err
+          .println("\t\toriginal servers. If false (default) unknown urls generate 404 Not Found.");
+      System.err
+          .println("-delay\tdelay every response by nnn seconds. If delay is negative use a random value up to nnn");
+      System.err
+          .println("-fake\tif specified, requests to all unknown urls will succeed with fake content");
+      System.exit(-1);
+    }
+
+    Configuration conf = NutchConfiguration.create();
+    int port = conf.getInt("segment.proxy.port", 8181);
+    boolean forward = false;
+    boolean fake = false;
+    boolean delay = false;
+    boolean debug = false;
+    int delayVal = 0;
+
+    HashSet<Path> segs = new HashSet<Path>();
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-segdir")) {
+        FileSystem fs = FileSystem.get(conf);
+        FileStatus[] fstats = fs.listStatus(new Path(args[++i]));
+        Path[] paths = HadoopFSUtil.getPaths(fstats);
+        segs.addAll(Arrays.asList(paths));
+      } else if (args[i].equals("-port")) {
+        port = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-forward")) {
+        forward = true;
+      } else if (args[i].equals("-delay")) {
+        delay = true;
+        delayVal = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-fake")) {
+        fake = true;
+      } else if (args[i].equals("-debug")) {
+        debug = true;
+      } else if (args[i].equals("-seg")) {
+        segs.add(new Path(args[++i]));
+      } else {
+        LOG.error("Unknown argument: " + args[i]);
+        System.exit(-1);
+      }
+    }
+
+    // Create the server
+    Server server = new Server();
+    SocketConnector connector = new SocketConnector();
+    connector.setPort(port);
+    connector.setResolveNames(false);
+    server.addConnector(connector);
+
+    // create a list of handlers
+    HandlerList list = new HandlerList();
+    server.addHandler(list);
+
+    if (debug) {
+      LOG.info("* Added debug handler.");
+      list.addHandler(new LogDebugHandler());
+    }
+
+    if (delay) {
+      LOG.info("* Added delay handler: "
+          + (delayVal < 0 ? "random delay up to " + (-delayVal)
+              : "constant delay of " + delayVal));
+      list.addHandler(new DelayHandler(delayVal));
+    }
+
+    // XXX alternatively, we can add the DispatchHandler as the first one,
+    // XXX to activate handler plugins and redirect requests to appropriate
+    // XXX handlers ... Here we always load these handlers
+
+    Iterator<Path> it = segs.iterator();
+    while (it.hasNext()) {
+      Path p = it.next();
+      try {
+        SegmentHandler segment = new SegmentHandler(conf, p);
+        list.addHandler(segment);
+        LOG.info("* Added segment handler for: " + p);
+      } catch (Exception e) {
+        LOG.warn("Skipping segment '" + p + "': "
+            + StringUtils.stringifyException(e));
+      }
+    }
+    if (forward) {
+      LOG.info("* Adding forwarding proxy for all unknown urls ...");
+      ServletHandler servlets = new ServletHandler();
+      servlets.addServletWithMapping(AsyncProxyServlet.class, "/*");
+      servlets.addFilterWithMapping(LogDebugHandler.class, "/*", Handler.ALL);
+      list.addHandler(servlets);
+    }
+    if (fake) {
+      LOG.info("* Added fake handler for remaining URLs.");
+      list.addHandler(new FakeHandler());
+    }
+    list.addHandler(new NotFoundHandler());
+    // Start the http server
+    server.start();
+    server.join();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/SegmentHandler.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/SegmentHandler.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/SegmentHandler.java
new file mode 100644
index 0000000..5d198b4
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/SegmentHandler.java
@@ -0,0 +1,255 @@
+package org.apache.nutch.tools.proxy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Partitioner;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.mortbay.jetty.Request;
+
+/**
+ * XXX should turn this into a plugin?
+ */
+public class SegmentHandler extends AbstractTestbedHandler {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SegmentHandler.class);
+  private Segment seg;
+
+  private static HashMap<Integer, Integer> protoCodes = new HashMap<Integer, Integer>();
+
+  static {
+    protoCodes.put(ProtocolStatus.ACCESS_DENIED,
+        HttpServletResponse.SC_UNAUTHORIZED);
+    protoCodes.put(ProtocolStatus.BLOCKED,
+        HttpServletResponse.SC_SERVICE_UNAVAILABLE);
+    protoCodes.put(ProtocolStatus.EXCEPTION,
+        HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
+    protoCodes.put(ProtocolStatus.FAILED, HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.GONE, HttpServletResponse.SC_GONE);
+    protoCodes.put(ProtocolStatus.MOVED,
+        HttpServletResponse.SC_MOVED_PERMANENTLY);
+    protoCodes.put(ProtocolStatus.NOTFETCHING,
+        HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.NOTFOUND, HttpServletResponse.SC_NOT_FOUND);
+    protoCodes.put(ProtocolStatus.NOTMODIFIED,
+        HttpServletResponse.SC_NOT_MODIFIED);
+    protoCodes.put(ProtocolStatus.PROTO_NOT_FOUND,
+        HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.REDIR_EXCEEDED,
+        HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.RETRY, HttpServletResponse.SC_BAD_REQUEST);
+    protoCodes.put(ProtocolStatus.ROBOTS_DENIED,
+        HttpServletResponse.SC_FORBIDDEN);
+    protoCodes.put(ProtocolStatus.SUCCESS, HttpServletResponse.SC_OK);
+    protoCodes.put(ProtocolStatus.TEMP_MOVED,
+        HttpServletResponse.SC_MOVED_TEMPORARILY);
+    protoCodes.put(ProtocolStatus.WOULDBLOCK,
+        HttpServletResponse.SC_BAD_REQUEST);
+  }
+
+  private static class SegmentPathFilter implements PathFilter {
+    public static final SegmentPathFilter INSTANCE = new SegmentPathFilter();
+
+    @Override
+    public boolean accept(Path p) {
+      return p.getName().startsWith("part-");
+    }
+
+  }
+
+  private static class Segment implements Closeable {
+
+    private static final Partitioner<Text, Writable> PARTITIONER = new HashPartitioner<Text, Writable>();
+
+    private Path segmentDir;
+
+    private Object cLock = new Object();
+    private Object crawlLock = new Object();
+    private MapFile.Reader[] content;
+    private MapFile.Reader[] parseText;
+    private MapFile.Reader[] parseData;
+    private MapFile.Reader[] crawl;
+    private Configuration conf;
+
+    public Segment(FileSystem fs, Path segmentDir, Configuration conf)
+        throws IOException {
+      this.segmentDir = segmentDir;
+      this.conf = conf;
+    }
+
+    public CrawlDatum getCrawlDatum(Text url) throws IOException {
+      synchronized (crawlLock) {
+        if (crawl == null)
+          crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
+      }
+      return (CrawlDatum) getEntry(crawl, url, new CrawlDatum());
+    }
+
+    public Content getContent(Text url) throws IOException {
+      synchronized (cLock) {
+        if (content == null)
+          content = getReaders(Content.DIR_NAME);
+      }
+      return (Content) getEntry(content, url, new Content());
+    }
+
+    /** Open the output generated by this format. */
+    private MapFile.Reader[] getReaders(String subDir) throws IOException {
+      Path dir = new Path(segmentDir, subDir);
+      FileSystem fs = dir.getFileSystem(conf);
+      Path[] names = FileUtil.stat2Paths(fs.listStatus(dir,
+          SegmentPathFilter.INSTANCE));
+
+      // sort names, so that hash partitioning works
+      Arrays.sort(names);
+
+      MapFile.Reader[] parts = new MapFile.Reader[names.length];
+      for (int i = 0; i < names.length; i++) {
+        parts[i] = new MapFile.Reader(names[i], conf);
+      }
+      return parts;
+    }
+
+    private Writable getEntry(MapFile.Reader[] readers, Text url, Writable entry)
+        throws IOException {
+      return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
+    }
+
+    public void close() throws IOException {
+      if (content != null) {
+        closeReaders(content);
+      }
+      if (parseText != null) {
+        closeReaders(parseText);
+      }
+      if (parseData != null) {
+        closeReaders(parseData);
+      }
+      if (crawl != null) {
+        closeReaders(crawl);
+      }
+    }
+
+    private void closeReaders(MapFile.Reader[] readers) throws IOException {
+      for (int i = 0; i < readers.length; i++) {
+        readers[i].close();
+      }
+    }
+
+  }
+
+  public SegmentHandler(Configuration conf, Path name) throws Exception {
+    seg = new Segment(FileSystem.get(conf), name, conf);
+  }
+
+  @Override
+  public void handle(Request req, HttpServletResponse res, String target,
+      int dispatch) throws IOException, ServletException {
+    try {
+      String uri = req.getUri().toString();
+      LOG.info("URI: " + uri);
+      addMyHeader(res, "URI", uri);
+      Text url = new Text(uri.toString());
+      CrawlDatum cd = seg.getCrawlDatum(url);
+      if (cd != null) {
+        addMyHeader(res, "Res", "found");
+        LOG.info("-got " + cd.toString());
+        ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(
+            Nutch.WRITABLE_PROTO_STATUS_KEY);
+        if (ps != null) {
+          Integer TrCode = protoCodes.get(ps.getCode());
+          if (TrCode != null) {
+            res.setStatus(TrCode.intValue());
+          } else {
+            res.setStatus(HttpServletResponse.SC_OK);
+          }
+          addMyHeader(res, "ProtocolStatus", ps.toString());
+        } else {
+          res.setStatus(HttpServletResponse.SC_OK);
+        }
+        Content c = seg.getContent(url);
+        if (c == null) { // missing content
+          req.setHandled(true);
+          res.addHeader("X-Handled-By", getClass().getSimpleName());
+          return;
+        }
+        byte[] data = c.getContent();
+        LOG.debug("-data len=" + data.length);
+        Metadata meta = c.getMetadata();
+        String[] names = meta.names();
+        LOG.debug("- " + names.length + " meta");
+        for (int i = 0; i < names.length; i++) {
+          boolean my = true;
+          char ch = names[i].charAt(0);
+          if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
+            // pretty good chance it's a standard header
+            my = false;
+          }
+          String[] values = meta.getValues(names[i]);
+          for (int k = 0; k < values.length; k++) {
+            if (my) {
+              addMyHeader(res, names[i], values[k]);
+            } else {
+              res.addHeader(names[i], values[k]);
+            }
+          }
+        }
+        req.setHandled(true);
+        res.addHeader("X-Handled-By", getClass().getSimpleName());
+        res.setContentType(meta.get(Metadata.CONTENT_TYPE));
+        res.setContentLength(data.length);
+        OutputStream os = res.getOutputStream();
+        os.write(data, 0, data.length);
+        res.flushBuffer();
+      } else {
+        addMyHeader(res, "Res", "not found");
+        LOG.info(" -not found " + url);
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+      LOG.warn(StringUtils.stringifyException(e));
+      addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/tools/proxy/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/proxy/package-info.java b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/package-info.java
new file mode 100644
index 0000000..cc820a7
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/proxy/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Proxy to {@link org.apache.nutch.tools.Benchmark benchmark} the crawler.
+ */
+package org.apache.nutch.tools.proxy;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/DumpFileUtilTest.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/DumpFileUtilTest.java b/nutch-core/src/test/java/org/apache/nutch/util/DumpFileUtilTest.java
new file mode 100644
index 0000000..03caa48
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/DumpFileUtilTest.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+public class DumpFileUtilTest {
+
+    @Test
+    public void testGetUrlMD5() throws Exception {
+        String testUrl = "http://apache.org";
+
+        String result = DumpFileUtil.getUrlMD5(testUrl);
+
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7", result);
+    }
+
+    @Test
+    public void testCreateTwoLevelsDirectory() throws Exception {
+        String testUrl = "http://apache.org";
+        String basePath = "/tmp";
+        String fullDir = DumpFileUtil.createTwoLevelsDirectory(basePath, DumpFileUtil.getUrlMD5(testUrl));
+
+        assertEquals("/tmp/96/ea", fullDir);
+
+        String basePath2 = "/this/path/is/not/existed/just/for/testing";
+        String fullDir2 = DumpFileUtil.createTwoLevelsDirectory(basePath2, DumpFileUtil.getUrlMD5(testUrl));
+
+        assertNull(fullDir2);
+    }
+
+    @Test
+    public void testCreateFileName() throws Exception {
+        String testUrl = "http://apache.org";
+        String baseName = "test";
+        String extension = "html";
+        String fullDir = DumpFileUtil.createFileName(DumpFileUtil.getUrlMD5(testUrl), baseName, extension);
+
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7_test.html", fullDir);
+
+        String tooLongBaseName = "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
+        String fullDir2 = DumpFileUtil.createFileName(DumpFileUtil.getUrlMD5(testUrl), tooLongBaseName, extension);
+
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7_testtesttesttesttesttesttesttest.html", fullDir2);
+
+        String tooLongExtension = "testtesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttesttest";
+        String fullDir3 = DumpFileUtil.createFileName(DumpFileUtil.getUrlMD5(testUrl), baseName, tooLongExtension);
+
+        assertEquals("991e599262e04ea2ec76b6c5aed499a7_test.testt", fullDir3);
+    }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestEncodingDetector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestEncodingDetector.java b/nutch-core/src/test/java/org/apache/nutch/util/TestEncodingDetector.java
new file mode 100644
index 0000000..8697a62
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestEncodingDetector.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.UnsupportedEncodingException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for EncodingDetector: charset guessing from the HTTP
+ * Content-Type header, explicitly added clues, and (when a minimum
+ * confidence is configured) content-based auto-detection.
+ */
+public class TestEncodingDetector {
+  private static Configuration conf = NutchConfiguration.create();
+
+  // sample page content, stored as UTF-8 octets
+  private static byte[] contentInOctets;
+
+  static {
+    try {
+      // NOTE(review): the first five characters are U+FFFD replacement
+      // characters, i.e. this literal looks mangled by an earlier encoding
+      // conversion -- compare against the pre-move source to confirm.
+      contentInOctets = "�����\u0414\u041b\u0436\u04b6".getBytes("utf-8");
+    } catch (UnsupportedEncodingException e) {
+      // not possible: "utf-8" is a charset every JVM must support
+    }
+  }
+
+  @Test
+  public void testGuessing() {
+    // first disable auto detection
+    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);
+
+    Metadata metadata = new Metadata();
+    EncodingDetector detector;
+    Content content;
+    String encoding;
+
+    content = new Content("http://www.example.com", "http://www.example.com/",
+        contentInOctets, "text/plain", metadata, conf);
+    detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    encoding = detector.guessEncoding(content, "windows-1252");
+    // no information is available, so it should return default encoding
+    Assert.assertEquals("windows-1252", encoding.toLowerCase());
+
+    // charset declared in the Content-Type header wins over the default
+    metadata.clear();
+    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
+    content = new Content("http://www.example.com", "http://www.example.com/",
+        contentInOctets, "text/plain", metadata, conf);
+    detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    encoding = detector.guessEncoding(content, "windows-1252");
+    Assert.assertEquals("utf-16", encoding.toLowerCase());
+
+    // an explicitly added clue wins over the default
+    metadata.clear();
+    content = new Content("http://www.example.com", "http://www.example.com/",
+        contentInOctets, "text/plain", metadata, conf);
+    detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    detector.addClue("windows-1254", "sniffed");
+    encoding = detector.guessEncoding(content, "windows-1252");
+    Assert.assertEquals("windows-1254", encoding.toLowerCase());
+
+    // enable autodetection
+    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
+    metadata.clear();
+    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
+    content = new Content("http://www.example.com", "http://www.example.com/",
+        contentInOctets, "text/plain", metadata, conf);
+    detector = new EncodingDetector(conf);
+    detector.autoDetectClues(content, true);
+    detector.addClue("utf-32", "sniffed");
+    encoding = detector.guessEncoding(content, "windows-1252");
+    // content bytes are UTF-8: auto-detection outranks header and clue here
+    Assert.assertEquals("utf-8", encoding.toLowerCase());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestGZIPUtils.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestGZIPUtils.java b/nutch-core/src/test/java/org/apache/nutch/util/TestGZIPUtils.java
new file mode 100644
index 0000000..a3d4610
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestGZIPUtils.java
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for GZIPUtils zip/unzip round-trips, truncation recovery, and size limits. */
+public class TestGZIPUtils {
+
+  /* a short, highly compressable, string */
+  String SHORT_TEST_STRING = "aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbcccccccccccccccc";
+
+  /* the short string repeated twelve times; longer, still highly compressable */
+  String LONGER_TEST_STRING = SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING
+      + SHORT_TEST_STRING;
+
+  /* a snapshot of the nutch webpage */
+  String WEBPAGE = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
+      + "<html>\n"
+      + "<head>\n"
+      + "  <meta http-equiv=\"content-type\"\n"
+      + " content=\"text/html; charset=ISO-8859-1\">\n"
+      + "  <title>Nutch</title>\n"
+      + "</head>\n"
+      + "<body>\n"
+      + "<h1\n"
+      + " style=\"font-family: helvetica,arial,sans-serif; text-align: center; color: rgb(255, 153, 0);\"><a\n"
+      + " href=\"http://www.nutch.org/\"><font style=\"color: rgb(255, 153, 0);\">Nutch</font></a><br>\n"
+      + "<small>an open source web-search engine</small></h1>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
+      + "<table\n"
+      + " style=\"width: 100%; text-align: left; margin-left: auto; margin-right: auto;\"\n"
+      + " border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n"
+      + "  <tbody>\n"
+      + "    <tr>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/project/showfiles.php?group_id=59548\">Download</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"tutorial.html\">Tutorial</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/nutch/nutch/\">CVS</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"api/index.html\">Javadoc</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/tracker/?atid=491356&amp;group_id=59548&amp;func=browse\">Bugs</a><br>\n"
+      + "      </td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"http://sourceforge.net/mail/?group_id=59548\">Lists</a></td>\n"
+      + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
+      + " href=\"policies.html\">Policies</a><br>\n"
+      + "      </td>\n"
+      + "    </tr>\n"
+      + "  </tbody>\n"
+      + "</table>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
+      + "<h2>Introduction</h2>\n"
+      + "Nutch is a nascent effort to implement an open-source web search\n"
+      + "engine. Web search is a basic requirement for internet navigation, yet\n"
+      + "the number of web search engines is decreasing. Today's oligopoly could\n"
+      + "soon be a monopoly, with a single company controlling nearly all web\n"
+      + "search for its commercial gain. &nbsp;That would not be good for the\n"
+      + "users of internet. &nbsp;Nutch aims to enable anyone to easily and\n"
+      + "cost-effectively deploy a world-class web search engine.<br>\n"
+      + "<br>\n"
+      + "To succeed, the Nutch software must be able to:<br>\n"
+      + "<ul>\n"
+      + "  <li> crawl several billion pages per month</li>\n"
+      + "  <li>maintain an index of these pages</li>\n"
+      + "  <li>search that index up to 1000 times per second</li>\n"
+      + "  <li>provide very high quality search results</li>\n"
+      + "  <li>operate at minimal cost</li>\n"
+      + "</ul>\n"
+      + "<h2>Status</h2>\n"
+      + "Currently we're just a handful of developers working part-time to put\n"
+      + "together a demo. &nbsp;The demo is coded entirely in Java. &nbsp;However\n"
+      + "persistent data is written in well-documented formats so that modules\n"
+      + "may eventually be re-written in other languages (e.g., Perl, C++) as the\n"
+      + "project progresses.<br>\n"
+      + "<br>\n"
+      + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\"> <a\n"
+      + " href=\"http://sourceforge.net\"> </a>\n"
+      + "<div style=\"text-align: center;\"><a href=\"http://sourceforge.net\"><img\n"
+      + " src=\"http://sourceforge.net/sflogo.php?group_id=59548&amp;type=1\"\n"
+      + " style=\"border: 0px solid ; width: 88px; height: 31px;\"\n"
+      + " alt=\"SourceForge.net Logo\" title=\"\"></a></div>\n"
+      + "</body>\n"
+      + "</html>\n";
+
+  /** Round-trips payloads of increasing size through zip()/unzip(). */
+  @Test
+  public void testZipUnzip() {
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
+    testZipUnzip(testBytes);
+    testBytes = LONGER_TEST_STRING.getBytes();
+    testZipUnzip(testBytes);
+    testBytes = WEBPAGE.getBytes();
+    testZipUnzip(testBytes);
+  }
+
+  /** Round-trips payloads of increasing size through zip()/unzipBestEffort(). */
+  @Test
+  public void testZipUnzipBestEffort() {
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
+    testZipUnzipBestEffort(testBytes);
+    testBytes = LONGER_TEST_STRING.getBytes();
+    testZipUnzipBestEffort(testBytes);
+    testBytes = WEBPAGE.getBytes();
+    testZipUnzipBestEffort(testBytes);
+  }
+
+  /**
+   * Feeds every truncation of the compressed data to unzipBestEffort() and
+   * checks that whatever is recovered is a prefix of the original bytes.
+   * Fix: this test was missing the {@code @Test} annotation, so the JUnit 4
+   * runner silently skipped it.
+   */
+  @Test
+  public void testTruncation() {
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
+    testTruncation(testBytes);
+    testBytes = LONGER_TEST_STRING.getBytes();
+    testTruncation(testBytes);
+    testBytes = WEBPAGE.getBytes();
+    testTruncation(testBytes);
+  }
+
+  /** unzipBestEffort(bytes, limit) must honor the size limit exactly. */
+  @Test
+  public void testLimit() {
+    byte[] testBytes = SHORT_TEST_STRING.getBytes();
+    testLimit(testBytes);
+    testBytes = LONGER_TEST_STRING.getBytes();
+    testLimit(testBytes);
+    testBytes = WEBPAGE.getBytes();
+    testLimit(testBytes);
+  }
+
+  // helpers
+
+  /** Compresses, decompresses via unzip(), and verifies a byte-for-byte match. */
+  public void testZipUnzip(byte[] origBytes) {
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
+
+    Assert.assertTrue("compressed array is not smaller!",
+        compressedBytes.length < origBytes.length);
+
+    byte[] uncompressedBytes = null;
+    try {
+      uncompressedBytes = GZIPUtils.unzip(compressedBytes);
+    } catch (IOException e) {
+      e.printStackTrace();
+      Assert.fail("caught exception '" + e + "' during unzip()");
+    }
+    Assert.assertEquals("uncompressedBytes is wrong size", origBytes.length,
+        uncompressedBytes.length);
+
+    for (int i = 0; i < origBytes.length; i++)
+      if (origBytes[i] != uncompressedBytes[i])
+        Assert.fail("uncompressedBytes does not match origBytes");
+  }
+
+  /** Compresses, decompresses via unzipBestEffort(), and verifies a match. */
+  public void testZipUnzipBestEffort(byte[] origBytes) {
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
+
+    Assert.assertTrue("compressed array is not smaller!",
+        compressedBytes.length < origBytes.length);
+
+    byte[] uncompressedBytes = GZIPUtils.unzipBestEffort(compressedBytes);
+    Assert.assertEquals("uncompressedBytes is wrong size", origBytes.length,
+        uncompressedBytes.length);
+
+    for (int i = 0; i < origBytes.length; i++)
+      if (origBytes[i] != uncompressedBytes[i])
+        Assert.fail("uncompressedBytes does not match origBytes");
+  }
+
+  /**
+   * Decompresses every possible truncation of the compressed form of
+   * origBytes; any recovered bytes must match the original prefix.
+   */
+  public void testTruncation(byte[] origBytes) {
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
+
+    System.out.println("original data has len " + origBytes.length);
+    System.out.println("compressed data has len " + compressedBytes.length);
+
+    for (int i = compressedBytes.length; i >= 0; i--) {
+
+      byte[] truncCompressed = new byte[i];
+      System.arraycopy(compressedBytes, 0, truncCompressed, 0, i);
+
+      byte[] trunc = GZIPUtils.unzipBestEffort(truncCompressed);
+
+      if (trunc == null) {
+        System.out.println("truncated to len " + i + ", trunc is null");
+      } else {
+        System.out.println("truncated to len " + i + ", trunc.length=  "
+            + trunc.length);
+
+        for (int j = 0; j < trunc.length; j++)
+          if (trunc[j] != origBytes[j])
+            Assert.fail("truncated/uncompressed array differs at pos "
+                + j + " (compressed data had been truncated to len " + i + ")");
+      }
+    }
+  }
+
+  /** Decompression limited to i bytes must return exactly i matching bytes. */
+  public void testLimit(byte[] origBytes) {
+    byte[] compressedBytes = GZIPUtils.zip(origBytes);
+
+    Assert.assertTrue("compressed array is not smaller!",
+        compressedBytes.length < origBytes.length);
+
+    for (int i = 0; i < origBytes.length; i++) {
+
+      byte[] uncompressedBytes = GZIPUtils.unzipBestEffort(compressedBytes, i);
+
+      Assert.assertEquals("uncompressedBytes is wrong size", i,
+          uncompressedBytes.length);
+
+      for (int j = 0; j < i; j++)
+        if (origBytes[j] != uncompressedBytes[j])
+          Assert.fail("uncompressedBytes does not match origBytes");
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
new file mode 100644
index 0000000..d812110
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.io.Files;
+
+import junit.framework.TestCase;
+import org.apache.nutch.test.TestUtils;
+
+/**
+ * Unit tests for MimeUtil: MIME type resolution from the HTTP Content-Type
+ * header, the URL's file-name suffix, and (optionally) content-byte magic.
+ *
+ * NOTE(review): extends JUnit 3 {@link TestCase} (methods run because their
+ * names start with "test") while sibling util tests use JUnit 4
+ * annotations -- consider migrating for consistency.
+ */
+public class TestMimeUtil extends TestCase {
+
+  public static String urlPrefix = "http://localhost/";
+
+  private static Charset defaultCharset = Charset.forName("UTF-8");
+
+  // directory holding the binary sample files; resolved once per instance,
+  // failing fast if the test resources are missing
+  private File sampleDir;
+  {
+    try {
+      sampleDir = TestUtils.getFile(this, "test-mime-util");
+    } catch (FileNotFoundException e){
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * test data, every element on "test page":
+   * <ol>
+   * <li>MIME type</li>
+   * <li>file name (last URL path element)</li>
+   * <li>Content-Type (HTTP header)</li>
+   * <li>content: if empty, do not test MIME magic</li>
+   * </ol>
+   */
+  public static String[][] textBasedFormats = {
+      {
+          "text/html",
+          "test.html",
+          "text/html; charset=utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />\n"
+              + "</head>\n<body>Hello, World!</body></html>" },
+      {
+          "text/html",
+          "test.html",
+          "", // no Content-Type in HTTP header => test URL pattern
+          "<!DOCTYPE html>\n<html>\n<head>\n"
+              + "</head>\n<body>Hello, World!</body></html>" },
+      {
+          "application/xhtml+xml",
+          "test.html",
+          "application/xhtml+xml; charset=utf-8",
+          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+              + "</head>\n<body>Hello, World!</body></html>" } };
+
+  // binary sample files on disk: MIME type, file name, Content-Type header
+  public static String[][] binaryFiles = { {
+      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+      "test.xlsx", "" } };
+
+  /** Reads {@code file} and delegates to the byte-array overload below. */
+  private String getMimeType(String url, File file, String contentType,
+      boolean useMagic) throws IOException {
+    return getMimeType(url, Files.toByteArray(file), contentType, useMagic);
+  }
+
+  /** Resolves the MIME type with magic detection switched on or off. */
+  private String getMimeType(String url, byte[] bytes, String contentType,
+      boolean useMagic) {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("mime.type.magic", useMagic);
+    MimeUtil mimeUtil = new MimeUtil(conf);
+    return mimeUtil.autoResolveContentType(contentType, url, bytes);
+  }
+
+  /** use HTTP Content-Type, URL pattern, and MIME magic */
+  public void testWithMimeMagic() {
+    for (String[] testPage : textBasedFormats) {
+      String mimeType = getMimeType(urlPrefix,
+          testPage[3].getBytes(defaultCharset), testPage[2], true);
+      assertEquals("", testPage[0], mimeType);
+    }
+  }
+
+  /** use only HTTP Content-Type (if given) and URL pattern */
+  public void testWithoutMimeMagic() {
+    for (String[] testPage : textBasedFormats) {
+      String mimeType = getMimeType(urlPrefix + testPage[1],
+          testPage[3].getBytes(defaultCharset), testPage[2], false);
+      assertEquals("", testPage[0], mimeType);
+    }
+  }
+
+  /** use only MIME magic (detection from content bytes) */
+  public void testOnlyMimeMagic() {
+    for (String[] testPage : textBasedFormats) {
+      String mimeType = getMimeType(urlPrefix,
+          testPage[3].getBytes(defaultCharset), "", true);
+      assertEquals("", testPage[0], mimeType);
+    }
+  }
+
+  /** test binary file formats (real files) */
+  public void testBinaryFiles() throws IOException {
+    for (String[] testPage : binaryFiles) {
+      File dataFile = new File(sampleDir, testPage[1]);
+      String mimeType = getMimeType(urlPrefix + testPage[1], dataFile,
+          testPage[2], false);
+      assertEquals("", testPage[0], mimeType);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestNodeWalker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestNodeWalker.java b/nutch-core/src/test/java/org/apache/nutch/util/TestNodeWalker.java
new file mode 100644
index 0000000..8edf5ab
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestNodeWalker.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.ByteArrayInputStream;
+
+import org.apache.xerces.parsers.DOMParser;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.w3c.dom.Node;
+import org.xml.sax.InputSource;
+
+/**
+ * Unit tests for NodeWalker: walks the DOM of a parsed HTML snapshot and
+ * verifies that skipChildren() prunes the subtree under the current node.
+ */
+public class TestNodeWalker {
+
+  /* a snapshot of the nutch webpage */
+  private final static String WEBPAGE = "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\"><head><title>Nutch</title></head>"
+      + "<body>"
+      + "<ul>"
+      + "<li>crawl several billion pages per month</li>"
+      + "<li>maintain an index of these pages</li>"
+      + "<li>search that index up to 1000 times per second</li>"
+      + "<li>provide very high quality search results</li>"
+      + "<li>operate at minimal cost</li>" + "</ul>" + "</body>" + "</html>";
+
+  // four of the five <li> texts from WEBPAGE, filled in by setUp()
+  private final static String[] ULCONTENT = new String[4];
+
+  @Before
+  public void setUp() throws Exception {
+    ULCONTENT[0] = "crawl several billion pages per month";
+    ULCONTENT[1] = "maintain an index of these pages";
+    ULCONTENT[2] = "search that index up to 1000 times per second";
+    ULCONTENT[3] = "operate at minimal cost";
+  }
+
+  @Test
+  public void testSkipChildren() {
+    DOMParser parser = new DOMParser();
+
+    try {
+      // no validation / no external DTD fetch: keeps the test offline
+      parser.setFeature("http://xml.org/sax/features/validation", false);
+      parser.setFeature(
+          "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+          false);
+      parser
+          .parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes())));
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+
+    // first pass: collect all text nodes -- list content must be present
+    StringBuffer sb = new StringBuffer();
+    NodeWalker walker = new NodeWalker(parser.getDocument());
+    while (walker.hasNext()) {
+      Node currentNode = walker.nextNode();
+      short nodeType = currentNode.getNodeType();
+      if (nodeType == Node.TEXT_NODE) {
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        sb.append(text);
+      }
+    }
+    Assert.assertTrue("UL Content can NOT be found in the node",
+        findSomeUlContent(sb.toString()));
+
+    // second pass: skip the children of every <ul> -- list content must be gone
+    StringBuffer sbSkip = new StringBuffer();
+    NodeWalker walkerSkip = new NodeWalker(parser.getDocument());
+    while (walkerSkip.hasNext()) {
+      Node currentNode = walkerSkip.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+      if ("ul".equalsIgnoreCase(nodeName)) {
+        walkerSkip.skipChildren();
+      }
+      if (nodeType == Node.TEXT_NODE) {
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        sbSkip.append(text);
+      }
+    }
+    Assert.assertFalse("UL Content can be found in the node",
+        findSomeUlContent(sbSkip.toString()));
+  }
+
+  /** Returns true if any of the expected list-item strings occurs in str. */
+  public boolean findSomeUlContent(String str) {
+    for (int i = 0; i < ULCONTENT.length; i++) {
+      if (str.contains(ULCONTENT[i]))
+        return true;
+    }
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestPrefixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestPrefixStringMatcher.java b/nutch-core/src/test/java/org/apache/nutch/util/TestPrefixStringMatcher.java
new file mode 100644
index 0000000..9d8b07b
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestPrefixStringMatcher.java
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Randomized unit test for PrefixStringMatcher: compares the matcher's
+ * answers against a brute-force scan over the same prefix list.
+ */
+public class TestPrefixStringMatcher {
+
+  private final static int NUM_TEST_ROUNDS = 20;
+  private final static int MAX_TEST_PREFIXES = 100;
+  private final static int MAX_PREFIX_LEN = 10;
+  private final static int NUM_TEST_INPUTS_PER_ROUND = 100;
+  private final static int MAX_INPUT_LEN = 20;
+
+  // small alphabet keeps accidental matches frequent enough to exercise
+  // both the matching and non-matching paths
+  private final static char[] alphabet = new char[] { 'a', 'b', 'c', 'd',
+  // 'e', 'f', 'g', 'h', 'i', 'j',
+  // 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+  // 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
+  // '5', '6', '7', '8', '9', '0'
+  };
+
+  /** Returns a random string over {@code alphabet}, length in [minLen, maxLen). */
+  private String makeRandString(int minLen, int maxLen) {
+    int len = minLen + (int) (Math.random() * (maxLen - minLen));
+    char[] chars = new char[len];
+
+    for (int pos = 0; pos < len; pos++) {
+      chars[pos] = alphabet[(int) (Math.random() * alphabet.length)];
+    }
+
+    return new String(chars);
+  }
+
+  @Test
+  public void testPrefixMatcher() {
+    int numMatches = 0;
+    int numInputsTested = 0;
+
+    for (int round = 0; round < NUM_TEST_ROUNDS; round++) {
+
+      // build list of prefixes
+      int numPrefixes = (int) (Math.random() * MAX_TEST_PREFIXES);
+      String[] prefixes = new String[numPrefixes];
+      for (int i = 0; i < numPrefixes; i++) {
+        prefixes[i] = makeRandString(0, MAX_PREFIX_LEN);
+      }
+
+      PrefixStringMatcher prematcher = new PrefixStringMatcher(prefixes);
+
+      // test random strings for prefix matches
+      for (int i = 0; i < NUM_TEST_INPUTS_PER_ROUND; i++) {
+        String input = makeRandString(0, MAX_INPUT_LEN);
+        boolean matches = false;
+        int longestMatch = -1;
+        int shortestMatch = -1;
+
+        // brute-force reference scan over every prefix
+        // NOTE(review): zero-length prefixes are skipped here -- presumably
+        // the matcher ignores them as well; confirm against its contract
+        for (int j = 0; j < prefixes.length; j++) {
+
+          if ((prefixes[j].length() > 0) && input.startsWith(prefixes[j])) {
+
+            matches = true;
+            int matchSize = prefixes[j].length();
+
+            if (matchSize > longestMatch)
+              longestMatch = matchSize;
+
+            if ((matchSize < shortestMatch) || (shortestMatch == -1))
+              shortestMatch = matchSize;
+          }
+
+        }
+
+        if (matches)
+          numMatches++;
+
+        numInputsTested++;
+
+        // matcher must agree with the brute force on the yes/no answer ...
+        Assert.assertTrue("'" + input + "' should " + (matches ? "" : "not ")
+            + "match!", matches == prematcher.matches(input));
+        // ... and on the shortest and longest matching prefixes
+        if (matches) {
+          Assert.assertTrue(shortestMatch == prematcher.shortestMatch(input)
+              .length());
+          Assert.assertTrue(input.substring(0, shortestMatch).equals(
+              prematcher.shortestMatch(input)));
+
+          Assert.assertTrue(longestMatch == prematcher.longestMatch(input)
+              .length());
+          Assert.assertTrue(input.substring(0, longestMatch).equals(
+              prematcher.longestMatch(input)));
+
+        }
+      }
+    }
+
+    System.out.println("got " + numMatches + " matches out of "
+        + numInputsTested + " tests");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestStringUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestStringUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestStringUtil.java
new file mode 100644
index 0000000..df021f0
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestStringUtil.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for StringUtil's left/right space-padding helpers. */
+public class TestStringUtil {
+
+  /**
+   * rightPad() pads with trailing spaces up to the requested length and
+   * returns the string unchanged when it is already long enough.
+   * Fix: this test was missing the {@code @Test} annotation (unlike
+   * {@code testLeftPad}), so the JUnit 4 runner silently skipped it.
+   */
+  @Test
+  public void testRightPad() {
+    String s = "my string";
+
+    // already >= requested length: returned unchanged
+    String ps = StringUtil.rightPad(s, 0);
+    Assert.assertEquals(s, ps);
+
+    ps = StringUtil.rightPad(s, 9);
+    Assert.assertEquals(s, ps);
+
+    // one character short: a single trailing space is appended
+    ps = StringUtil.rightPad(s, 10);
+    Assert.assertEquals(s + " ", ps);
+
+    ps = StringUtil.rightPad(s, 15);
+    Assert.assertEquals(s + "      ", ps);
+  }
+
+  /** leftPad() pads with leading spaces up to the requested length. */
+  @Test
+  public void testLeftPad() {
+    String s = "my string";
+
+    String ps = StringUtil.leftPad(s, 0);
+    Assert.assertEquals(s, ps);
+
+    ps = StringUtil.leftPad(s, 9);
+    Assert.assertEquals(s, ps);
+
+    ps = StringUtil.leftPad(s, 10);
+    Assert.assertEquals(" " + s, ps);
+
+    ps = StringUtil.leftPad(s, 15);
+    Assert.assertEquals("      " + s, ps);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestSuffixStringMatcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestSuffixStringMatcher.java b/nutch-core/src/test/java/org/apache/nutch/util/TestSuffixStringMatcher.java
new file mode 100644
index 0000000..f2e8a5c
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestSuffixStringMatcher.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for SuffixStringMatcher. */
+public class TestSuffixStringMatcher {
+
+  private final static int NUM_TEST_ROUNDS = 20;
+  private final static int MAX_TEST_SUFFIXES = 100;
+  private final static int MAX_SUFFIX_LEN = 10;
+  private final static int NUM_TEST_INPUTS_PER_ROUND = 100;
+  private final static int MAX_INPUT_LEN = 20;
+
+  private final static char[] alphabet = new char[] { 'a', 'b', 'c', 'd',
+  // 'e', 'f', 'g', 'h', 'i', 'j',
+  // 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+  // 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
+  // '5', '6', '7', '8', '9', '0'
+  };
+
+  private String makeRandString(int minLen, int maxLen) {
+    int len = minLen + (int) (Math.random() * (maxLen - minLen));
+    char[] chars = new char[len];
+
+    for (int pos = 0; pos < len; pos++) {
+      chars[pos] = alphabet[(int) (Math.random() * alphabet.length)];
+    }
+
+    return new String(chars);
+  }
+
+  @Test
+  public void testSuffixMatcher() {
+    int numMatches = 0;
+    int numInputsTested = 0;
+
+    for (int round = 0; round < NUM_TEST_ROUNDS; round++) {
+
+      // build list of suffixes
+      int numSuffixes = (int) (Math.random() * MAX_TEST_SUFFIXES);
+      String[] suffixes = new String[numSuffixes];
+      for (int i = 0; i < numSuffixes; i++) {
+        suffixes[i] = makeRandString(0, MAX_SUFFIX_LEN);
+      }
+
+      SuffixStringMatcher sufmatcher = new SuffixStringMatcher(suffixes);
+
+      // test random strings for suffix matches
+      for (int i = 0; i < NUM_TEST_INPUTS_PER_ROUND; i++) {
+        String input = makeRandString(0, MAX_INPUT_LEN);
+        boolean matches = false;
+        int longestMatch = -1;
+        int shortestMatch = -1;
+
+        for (int j = 0; j < suffixes.length; j++) {
+
+          if ((suffixes[j].length() > 0) && input.endsWith(suffixes[j])) {
+
+            matches = true;
+            int matchSize = suffixes[j].length();
+
+            if (matchSize > longestMatch)
+              longestMatch = matchSize;
+
+            if ((matchSize < shortestMatch) || (shortestMatch == -1))
+              shortestMatch = matchSize;
+          }
+
+        }
+
+        if (matches)
+          numMatches++;
+
+        numInputsTested++;
+
+        Assert.assertTrue("'" + input + "' should " + (matches ? "" : "not ")
+            + "match!", matches == sufmatcher.matches(input));
+        if (matches) {
+          Assert.assertTrue(shortestMatch == sufmatcher.shortestMatch(input)
+              .length());
+          Assert.assertTrue(input.substring(input.length() - shortestMatch)
+              .equals(sufmatcher.shortestMatch(input)));
+
+          Assert.assertTrue(longestMatch == sufmatcher.longestMatch(input)
+              .length());
+          Assert.assertTrue(input.substring(input.length() - longestMatch)
+              .equals(sufmatcher.longestMatch(input)));
+        }
+      }
+    }
+
+    System.out.println("got " + numMatches + " matches out of "
+        + numInputsTested + " tests");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestTableUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestTableUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestTableUtil.java
new file mode 100644
index 0000000..fb07556
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestTableUtil.java
@@ -0,0 +1,75 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.util;
+
+import org.apache.nutch.util.TableUtil;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class TestTableUtil {
+
+  String urlString1 = "http://foo.com/";
+  String urlString2 = "http://foo.com:8900/";
+  String urlString3 = "ftp://bar.baz.com/";
+  String urlString4 = "http://bar.baz.com:8983/to/index.html?a=b&c=d";
+  String urlString5 = "http://foo.com?a=/a/b&c=0";
+  String urlString5rev = "http://foo.com/?a=/a/b&c=0";
+  String urlString6 = "http://foo.com";
+  String urlString7 = "file:///var/www/index.html";
+
+  String reversedUrlString1 = "com.foo:http/";
+  String reversedUrlString2 = "com.foo:http:8900/";
+  String reversedUrlString3 = "com.baz.bar:ftp/";
+  String reversedUrlString4 = "com.baz.bar:http:8983/to/index.html?a=b&c=d";
+  String reversedUrlString5 = "com.foo:http/?a=/a/b&c=0";
+  String reversedUrlString6 = "com.foo:http";
+  String reversedUrlString7 = ":file/var/www/index.html";
+
+  @Test
+  public void testReverseUrl() throws Exception {
+    assertReverse(urlString1, reversedUrlString1);
+    assertReverse(urlString2, reversedUrlString2);
+    assertReverse(urlString3, reversedUrlString3);
+    assertReverse(urlString4, reversedUrlString4);
+    assertReverse(urlString5, reversedUrlString5);
+    assertReverse(urlString5, reversedUrlString5);
+    assertReverse(urlString6, reversedUrlString6);
+    assertReverse(urlString7, reversedUrlString7);
+  }
+
+  @Test
+  public void testUnreverseUrl() throws Exception {
+    assertUnreverse(reversedUrlString1, urlString1);
+    assertUnreverse(reversedUrlString2, urlString2);
+    assertUnreverse(reversedUrlString3, urlString3);
+    assertUnreverse(reversedUrlString4, urlString4);
+    assertUnreverse(reversedUrlString5, urlString5rev);
+    assertUnreverse(reversedUrlString6, urlString6);
+    assertUnreverse(reversedUrlString7, urlString7);
+  }
+
+  private static void assertReverse(String url, String expectedReversedUrl)
+      throws Exception {
+    String reversed = TableUtil.reverseUrl(url);
+    assertEquals(expectedReversedUrl, reversed);
+  }
+
+  private static void assertUnreverse(String reversedUrl, String expectedUrl) {
+    String unreversed = TableUtil.unreverseUrl(reversedUrl);
+    assertEquals(expectedUrl, unreversed);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/util/TestURLUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestURLUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestURLUtil.java
new file mode 100644
index 0000000..b1fdd5b
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestURLUtil.java
@@ -0,0 +1,281 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.URL;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Test class for URLUtil */
+public class TestURLUtil {
+
+  @Test
+  public void testGetDomainName() throws Exception {
+
+    URL url = null;
+
+    url = new URL("http://lucene.apache.org/nutch");
+    Assert.assertEquals("apache.org", URLUtil.getDomainName(url));
+
+    url = new URL("http://en.wikipedia.org/wiki/Java_coffee");
+    Assert.assertEquals("wikipedia.org", URLUtil.getDomainName(url));
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    Assert.assertEquals("140.211.11.130", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.co.uk:8080/index.html");
+    Assert.assertEquals("example.co.uk", URLUtil.getDomainName(url));
+
+    url = new URL("http://com");
+    Assert.assertEquals("com", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.co.uk.com");
+    Assert.assertEquals("uk.com", URLUtil.getDomainName(url));
+
+    // "nn" is not a tld
+    url = new URL("http://example.com.nn");
+    Assert.assertEquals("nn", URLUtil.getDomainName(url));
+
+    url = new URL("http://");
+    Assert.assertEquals("", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.edu.tr.xyz");
+    Assert.assertEquals("xyz", URLUtil.getDomainName(url));
+
+    url = new URL("http://www.example.c.se");
+    Assert.assertEquals("example.c.se", URLUtil.getDomainName(url));
+
+    // plc.co.im is listed as a domain suffix
+    url = new URL("http://www.example.plc.co.im");
+    Assert.assertEquals("example.plc.co.im", URLUtil.getDomainName(url));
+
+    // 2000.hu is listed as a domain suffix
+    url = new URL("http://www.example.2000.hu");
+    Assert.assertEquals("example.2000.hu", URLUtil.getDomainName(url));
+
+    // test non-ascii
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    Assert.assertEquals("example.\u5546\u696d.tw", URLUtil.getDomainName(url));
+  }
+
+  @Test
+  public void testGetDomainSuffix() throws Exception {
+    URL url = null;
+
+    url = new URL("http://lucene.apache.org/nutch");
+    Assert.assertEquals("org", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    Assert.assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://www.example.co.uk:8080/index.html");
+    Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://com");
+    Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://www.example.co.uk.com");
+    Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
+
+    // "nn" is not a tld
+    url = new URL("http://example.com.nn");
+    Assert.assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://");
+    Assert.assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://www.edu.tr.xyz");
+    Assert.assertNull(URLUtil.getDomainSuffix(url));
+
+    url = new URL("http://subdomain.example.edu.tr");
+    Assert.assertEquals("edu.tr", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://subdomain.example.presse.fr");
+    Assert.assertEquals("presse.fr", URLUtil.getDomainSuffix(url).getDomain());
+
+    url = new URL("http://subdomain.example.presse.tr");
+    Assert.assertEquals("tr", URLUtil.getDomainSuffix(url).getDomain());
+
+    // plc.co.im is listed as a domain suffix
+    url = new URL("http://www.example.plc.co.im");
+    Assert.assertEquals("plc.co.im", URLUtil.getDomainSuffix(url).getDomain());
+
+    // 2000.hu is listed as a domain suffix
+    url = new URL("http://www.example.2000.hu");
+    Assert.assertEquals("2000.hu", URLUtil.getDomainSuffix(url).getDomain());
+
+    // test non-ascii
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    Assert.assertEquals("\u5546\u696d.tw", URLUtil.getDomainSuffix(url).getDomain());
+  }
+
+  @Test
+  public void testGetHostSegments() throws Exception {
+    URL url;
+    String[] segments;
+
+    url = new URL("http://subdomain.example.edu.tr");
+    segments = URLUtil.getHostSegments(url);
+    Assert.assertEquals("subdomain", segments[0]);
+    Assert.assertEquals("example", segments[1]);
+    Assert.assertEquals("edu", segments[2]);
+    Assert.assertEquals("tr", segments[3]);
+
+    url = new URL("http://");
+    segments = URLUtil.getHostSegments(url);
+    Assert.assertEquals(1, segments.length);
+    Assert.assertEquals("", segments[0]);
+
+    url = new URL("http://140.211.11.130/foundation/contributing.html");
+    segments = URLUtil.getHostSegments(url);
+    Assert.assertEquals(1, segments.length);
+    Assert.assertEquals("140.211.11.130", segments[0]);
+
+    // test non-ascii
+    url = new URL("http://www.example.\u5546\u696d.tw");
+    segments = URLUtil.getHostSegments(url);
+    Assert.assertEquals("www", segments[0]);
+    Assert.assertEquals("example", segments[1]);
+    Assert.assertEquals("\u5546\u696d", segments[2]);
+    Assert.assertEquals("tw", segments[3]);
+
+  }
+
+  @Test
+  public void testChooseRepr() throws Exception {
+
+    String aDotCom = "http://www.a.com";
+    String bDotCom = "http://www.b.com";
+    String aSubDotCom = "http://www.news.a.com";
+    String aQStr = "http://www.a.com?y=1";
+    String aPath = "http://www.a.com/xyz/index.html";
+    String aPath2 = "http://www.a.com/abc/page.html";
+    String aPath3 = "http://www.news.a.com/abc/page.html";
+
+    // 1) different domain then keep dest, temp or perm
+    // a.com -> b.com*
+    Assert.assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, true));
+    Assert.assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, false));
+
+    // 2) permanent and root, keep src
+    // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aQStr, false));
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, false));
+
+    // 3) permanent and not root and dest root, keep dest
+    // a.com/xyz/index.html -> a.com*
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, false));
+
+    // 4) permanent and neither root keep dest
+    // a.com/xyz/index.html -> a.com/abc/page.html*
+    Assert.assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, false));
+
+    // 5) temp and root and dest not root keep src
+    // *a.com -> a.com/xyz/index.html
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, true));
+
+    // 6) temp and not root and dest root keep dest
+    // a.com/xyz/index.html -> a.com*
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, true));
+
+    // 7) temp and neither root, keep shortest, if hosts equal by path else by
+    // hosts
+    // a.com/xyz/index.html -> a.com/abc/page.html*
+    // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
+    Assert.assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, true));
+    Assert.assertEquals(aPath, URLUtil.chooseRepr(aPath, aPath3, true));
+
+    // 8) temp and both root keep shortest sub domain
+    // *www.a.com -> www.news.a.com
+    Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
+  }
+
+  // from RFC3986 section 5.4.1
+  private static String baseString = "http://a/b/c/d;p?q";
+  private static String[][] targets = new String[][] {
+      // unknown protocol {"g:h" , "g:h"},
+      { "g", "http://a/b/c/g" }, { "./g", "http://a/b/c/g" },
+      { "g/", "http://a/b/c/g/" }, { "/g", "http://a/g" },
+      { "//g", "http://g" }, { "?y", "http://a/b/c/d;p?y" },
+      { "g?y", "http://a/b/c/g?y" }, { "#s", "http://a/b/c/d;p?q#s" },
+      { "g#s", "http://a/b/c/g#s" }, { "g?y#s", "http://a/b/c/g?y#s" },
+      { ";x", "http://a/b/c/;x" }, { "g;x", "http://a/b/c/g;x" },
+      { "g;x?y#s", "http://a/b/c/g;x?y#s" }, { "", "http://a/b/c/d;p?q" },
+      { ".", "http://a/b/c/" }, { "./", "http://a/b/c/" },
+      { "..", "http://a/b/" }, { "../", "http://a/b/" },
+      { "../g", "http://a/b/g" }, { "../..", "http://a/" },
+      { "../../", "http://a/" }, { "../../g", "http://a/g" } };
+
+  @Test
+  public void testResolveURL() throws Exception {
+    // test NUTCH-436
+    URL u436 = new URL("http://a/b/c/d;p?q#f");
+    Assert.assertEquals("http://a/b/c/d;p?q#f", u436.toString());
+    URL abs = URLUtil.resolveURL(u436, "?y");
+    Assert.assertEquals("http://a/b/c/d;p?y", abs.toString());
+    // test NUTCH-566
+    URL u566 = new URL("http://www.fleurie.org/entreprise.asp");
+    abs = URLUtil.resolveURL(u566, "?id_entrep=111");
+    Assert.assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111",
+        abs.toString());
+    URL base = new URL(baseString);
+    Assert.assertEquals("base url parsing", baseString, base.toString());
+    for (int i = 0; i < targets.length; i++) {
+      URL u = URLUtil.resolveURL(base, targets[i][0]);
+      Assert.assertEquals(targets[i][1], targets[i][1], u.toString());
+    }
+  }
+
+  @Test
+  public void testToUNICODE() throws Exception {
+    Assert.assertEquals("http://www.çevir.com",
+        URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));
+    Assert.assertEquals("http://uni-tübingen.de/",
+        URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/"));
+    Assert
+        .assertEquals(
+            "http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
+            URLUtil
+                .toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
+
+  }
+
+  @Test
+  public void testToASCII() throws Exception {
+    Assert.assertEquals("http://www.xn--evir-zoa.com",
+        URLUtil.toASCII("http://www.çevir.com"));
+    Assert.assertEquals("http://xn--uni-tbingen-xhb.de/",
+        URLUtil.toASCII("http://uni-tübingen.de/"));
+    Assert
+        .assertEquals(
+            "http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
+            URLUtil
+                .toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
+  }
+
+  @Test
+  public void testFileProtocol() throws Exception {
+    // keep one single slash NUTCH-XXX
+    Assert.assertEquals("file:/path/file.html",
+        URLUtil.toASCII("file:/path/file.html"));
+    Assert.assertEquals("file:/path/file.html",
+        URLUtil.toUNICODE("file:/path/file.html"));
+  }
+
+}


[05/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/ivy.xml b/nutch-plugins/protocol-interactiveselenium/ivy.xml
new file mode 100644
index 0000000..ff07f8c
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="default"/>
+  </publications>
+
+  <dependencies>
+    <!-- Note: only dependencies which are not contained in lib-selenium have to be listed here! -->
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/plugin.xml b/nutch-plugins/protocol-interactiveselenium/plugin.xml
new file mode 100644
index 0000000..a69a1e5
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-interactiveselenium"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-interactiveselenium.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+      <import plugin="lib-selenium"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.interactiveselenium"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.interactiveselenium.Http"
+                      class="org.apache.nutch.protocol.interactiveselenium.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/pom.xml b/nutch-plugins/protocol-interactiveselenium/pom.xml
new file mode 100644
index 0000000..ced9cdc
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/pom.xml
@@ -0,0 +1,50 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-interactiveselenium</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-interactiveselenium</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-selenium</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/Http.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/Http.java
new file mode 100644
index 0000000..9449fa1
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/Http.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.interactiveselenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  public Http() {
+    super(LOG);
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
new file mode 100644
index 0000000..a1ccf29
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
@@ -0,0 +1,399 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.openqa.selenium.WebDriver;
+
+import org.apache.nutch.protocol.selenium.HttpWebClient;
+
+/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */
+
+public class HttpResponse implements Response {
+
+  private Http http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+  private static InteractiveSeleniumHandler[] handlers;
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    if (!"http".equals(url.getProtocol()))
+      throw new HttpException("Not an HTTP url:" + url);
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      byte[] reqBytes = reqStr.toString().getBytes();
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+              Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with Selenium only if the content type is HTML or XHTML
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          readPlainContent(url);
+        } else {
+          try {
+            int contentLength = Integer.MAX_VALUE;
+            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+            if (contentLengthString != null) {
+              try {
+                contentLength = Integer.parseInt(contentLengthString.trim());
+              } catch (NumberFormatException ex) {
+                throw new HttpException("bad content length: " + contentLengthString);
+              }
+            }
+
+            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+              contentLength = http.getMaxContent();
+            }
+
+            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+            int bufferFilled = 0;
+            int totalRead = 0;
+            ByteArrayOutputStream out = new ByteArrayOutputStream();
+            while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
+                && totalRead + bufferFilled <= contentLength) {
+              totalRead += bufferFilled;
+              out.write(buffer, 0, bufferFilled);
+            }
+
+            content = out.toByteArray();
+
+          } catch (Exception e) {
+            if (code == 200)
+              throw new IOException(e.toString());
+            // for codes other than 200 OK, we are fine with empty content
+          } finally {
+            if (in != null) {
+              in.close();
+            }
+          }
+        }
+      } 
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+  private void loadSeleniumHandlers() {
+    if (handlers != null) return;
+
+    String handlerConfig = this.conf.get("interactiveselenium.handlers", "DefaultHandler");
+    String[] handlerNames = handlerConfig.split(",");
+    handlers = new InteractiveSeleniumHandler[handlerNames.length];
+    for (int i = 0; i < handlerNames.length; i++) {
+        try {
+            String classToLoad = this.getClass().getPackage().getName() + "." + handlerNames[i];
+            handlers[i] = InteractiveSeleniumHandler.class.cast(Class.forName(classToLoad).newInstance());
+            Http.LOG.info("Successfully loaded " + classToLoad);
+        } catch (ClassNotFoundException e) {
+            Http.LOG.info("Unable to load Handler class for: " + handlerNames[i]);
+        } catch (InstantiationException e) {
+            Http.LOG.info("Unable to instantiate Handler: " + handlerNames[i]);
+        } catch (IllegalAccessException e) {
+            Http.LOG.info("Illegal access with Handler: " + handlerNames[i]);
+        }
+    }
+  }
+
+  private void readPlainContent(URL url) throws IOException {
+    if (handlers == null)
+        loadSeleniumHandlers();
+
+    String processedPage = "";
+
+    for (InteractiveSeleniumHandler handler : this.handlers) {
+        if (! handler.shouldProcessURL(url.toString())) {
+            continue;
+        }
+
+        WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf);
+
+        processedPage += handler.processDriver(driver);
+
+        HttpWebClient.cleanUpDriver(driver);
+    }
+
+    content = processedPage.getBytes("UTF-8");
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  // Adds headers to our headers Metadata
+  private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+          || ((pos = line.indexOf("<html")) != -1)) {
+
+        in.unread(line.substring(pos).getBytes("UTF-8"));
+        line.setLength(pos);
+
+        try {
+          //TODO: (CM) We don't know the header names here
+          //since we're just handling them generically. It would
+          //be nice to provide some sort of mapping function here
+          //for the returned header names to the standard metadata
+          //names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
+      throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    in.unread(value);
+    return value;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
new file mode 100644
index 0000000..f3c0f6f
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.apache.hadoop.util.StringUtils;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a placeholder/example of a technique or use case where we perform multiple
+ * interactions with the web driver and need the data from each interaction at the end. This code shows that after you
+ * have performed multiple interactions and accumulated data, you can append that data to the driver at the end.
+ */
+public class DefalultMultiInteractionHandler implements
+    InteractiveSeleniumHandler {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DefalultMultiInteractionHandler.class);
+
+  public String processDriver(WebDriver driver) {
+    // loop and get multiple pages in this string
+    String accumulatedData = "";
+    try {
+      
+      // append the string to the last page's driver
+      JavascriptExecutor jsx = (JavascriptExecutor) driver;
+      jsx.executeScript("document.body.innerHTML=document.body.innerHTML "
+          + accumulatedData + ";");
+    } catch (Exception e) {
+      LOG.info(StringUtils.stringifyException(e));
+    }
+    return accumulatedData;
+  }
+
+  public boolean shouldProcessURL(String URL) {
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
new file mode 100644
index 0000000..e3423d5
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.NutchConfiguration;
+import org.openqa.selenium.By;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.support.ui.WebDriverWait;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This handler clicks all the <a href="javascript:void(null);"> tags
+ * because it considers them not as usual links but as Ajax links/interactions. It uses the same logic as
+ * DefalultMultiInteractionHandler.
+ */
+public class DefaultClickAllAjaxLinksHandler implements InteractiveSeleniumHandler {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DefaultClickAllAjaxLinksHandler.class);
+
+  public String processDriver(WebDriver driver) {
+    
+    String accumulatedData = "";
+    try {
+      
+
+      driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+      Configuration conf = NutchConfiguration.create();
+      new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay", 3));
+
+      List<WebElement> atags = driver.findElements(By.tagName("a"));
+      int numberofajaxlinks = atags.size();
+      for (int i = 0; i < numberofajaxlinks; i++) {
+
+        if (atags.get(i).getAttribute("href") != null
+            && atags.get(i).getAttribute("href")
+                .equals("javascript:void(null);")) {
+
+          atags.get(i).click();
+
+          if (i == numberofajaxlinks - 1) {
+            // append everything to the driver in the last round
+            JavascriptExecutor jsx = (JavascriptExecutor) driver;
+            jsx.executeScript("document.body.innerHTML=document.body.innerHTML "
+                + accumulatedData + ";");
+            continue;
+          }
+
+          accumulatedData += driver.findElement(By.tagName("body"))
+              .getAttribute("innerHTML");
+
+          // refreshing the handlers as the page was interacted with
+          driver.navigate().refresh();
+          new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay",
+              3));
+          atags = driver.findElements(By.tagName("a"));
+        }
+      }
+    } catch (Exception e) {
+      LOG.info(StringUtils.stringifyException(e));
+    }
+    return accumulatedData;
+  }
+
+  public boolean shouldProcessURL(String URL) {
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
new file mode 100644
index 0000000..ae7b97e
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.openqa.selenium.WebDriver;
+
+public class DefaultHandler implements InteractiveSeleniumHandler {
+    public String processDriver(WebDriver driver) {
+      return null;
+    }
+
+    public boolean shouldProcessURL(String URL) {
+        return true;
+    }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
new file mode 100644
index 0000000..9ce1e26
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.openqa.selenium.WebDriver;
+
+public interface InteractiveSeleniumHandler {
+    public String processDriver(WebDriver driver);
+    public boolean shouldProcessURL(String URL);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/package.html b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/package.html
new file mode 100644
index 0000000..75cd5b5
--- /dev/null
+++ b/nutch-plugins/protocol-interactiveselenium/src/main/java/org/apache/nutch/protocol/interactiveselenium/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/README.md
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/README.md b/nutch-plugins/protocol-selenium/README.md
new file mode 100644
index 0000000..1462b47
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/README.md
@@ -0,0 +1,208 @@
+Nutch Selenium
+==============
+
+# Introduction
+
+This plugin allows you to fetch Javascript pages using [Selenium](http://www.seleniumhq.org/), while relying on the rest of the awesome Nutch stack!
+
+The underlying code is based on the nutch-htmlunit plugin, which was in turn based on nutch-httpclient.
+
+There are essentially two ways in which Nutch can be used with Selenium.
+
+ * Locally (on each node) as a self contained process, or
+ * via the RemoteWebDriver which connects to [Selenium-Grid](http://www.seleniumhq.org/docs/07_selenium_grid.jsp). A grid consists of a single hub, and one or more nodes.
+
+# Installation
+
+## Part 1: 
+
+### A) Setting up Selenium (local mode)
+
+ * Ensure that you have your preferred browser installed. Currently Chrome, Safari, Opera, PhantomJS and Firefox are supported. Here, an example of installing Firefox is provided. More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox)
+```
+sudo apt-get install firefox
+```
+
+ * Install Xvfb and its associates
+
+This step is not necessary for the PhantomJS browser and may not be needed for all browsers.
+
+```
+sudo apt-get install xorg synaptic xvfb gtk2-engines-pixbuf xfonts-cyrillic xfonts-100dpi \
+    xfonts-75dpi xfonts-base xfonts-scalable freeglut3-dev dbus-x11 openbox x11-xserver-utils \
+    libxrender1 cabextract
+```
+
+ * Set a display for Xvfb, so that firefox believes a display is connected
+ 
+```
+sudo /usr/bin/Xvfb :11 -screen 0 1024x768x24 &
+sudo export DISPLAY=:11
+```
+### B) Setting up a Selenium Grid 
+
+Using the Selenium Grid will allow you to parallelize the job by facilitating access to several instances of browsers, whether on one machine or on several machines. Note that the grid facilitates heterogeneity with regards to the browser types used. However, these steps have been tested using a homogeneous Selenium Grid with Firefox and PhantomJS browsers.
+
+ * Download the [Selenium Standalone Server](http://www.seleniumhq.org/download/) and follow the installation instructions.
+ 
+ * Some important configurations to note while setting up the selenium-hub and the selenium-nodes are:
+    * For the hub: 
+      - maxSession (how many browser sessions to allow on the grid at a time)
+      - browserTimeout (how long to wait before timing out a browser session. This is dependent on the interactivity to be completed on the page)
+      
+    * For the nodes:
+      - browserName=<browser>, maxInstances (the max number of instances of the same version browser to allow per a system)
+      - browserName=<browser>, maxSession (the max number of sessions of any type of browser/version to allow per a system)
+      
+  * Go headless with your Selenium Grid installation. There are different ways to do this. See [this resource](http://elementalselenium.com/tips/38-headless) for further details. 
+ 
+  * For Nutch efficiency, and optimization of the grid, consider editing the following configs in **nutch-site.xml**
+    - fetcher.threads.per.queue (change value to the value of the maxSession config on the hub)
+    - fetcher.threads.fetch (change value to the value of the maxSession config on the hub)
+    - fetcher.server.delay (As multiple threads may be accessing a single server at a time, consider changing this value to 4-5 seconds for politeness)
+    - fetcher.server.min.delay (As multiple threads may be accessing a single server at a time, consider changing this value to 4-5 seconds for politeness)
+    - Ensure all configs for the hub mentioned in Part 2 are appropriately set. 
+
+  * To activate the full selenium grid, edit **$NUTCH_HOME/runtime/local/bin/crawl** script:
+    - numThreads = maxSession on nodes * num of nodes
+
+
+## Part 2: Installing plugin for Nutch (where NUTCH_HOME is the root of your nutch install)
+
+ * Ensure that the plugin will be used as the protocol parser in your config
+
+```
+<!-- NUTCH_HOME/conf/nutch-site.xml -->
+
+<configuration>
+  ...
+  <property>
+    <name>plugin.includes</name>
+    <value>protocol-selenium|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
+    <description>Regular expression naming plugin directory names to
+    include.  Any plugin not matching this expression is excluded.
+    In any case you need at least include the nutch-extensionpoints plugin. By
+    default Nutch includes crawling just HTML and plain text via HTTP,
+    and basic indexing and search plugins. In order to use HTTPS please enable 
+    protocol-httpclient, but be aware of possible intermittent problems with the 
+    underlying commons-httpclient library.
+    </description>
+  </property>
+```
+
+* Then ensure that you have the correct configuration set within the following configuration options
+
+```
+<!-- protocol-selenium plugin properties -->
+
+<property>
+  <name>selenium.driver</name>
+  <value>firefox</value>
+  <description>
+    A String value representing the flavour of Selenium 
+    WebDriver() to use. Currently the following options
+    exist - 'firefox', 'chrome', 'safari', 'opera', 'phantomJS', and 'remote'.
+    If 'remote' is used it is essential to also set correct properties for
+    'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and
+    'selenium.hub.protocol'.
+  </description>
+</property>
+
+<property>
+  <name>selenium.take.screenshot</name>
+  <value>false</value>
+  <description>
+    Boolean property determining whether the protocol-selenium
+    WebDriver should capture a screenshot of the URL. If set to
+    true remember to define the 'selenium.screenshot.location' 
+    property as this determines the location screenshots should be 
+    persisted to on HDFS. If that property is not set, screenshots
+    are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>selenium.screenshot.location</name>
+  <value></value>
+  <description>
+    The location on disk where a URL screenshot should be saved
+    to if the 'selenium.take.screenshot' property is set to true.
+    By default this is null, in this case screenshots held in memory
+    are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>selenium.hub.port</name>
+  <value>4444</value>
+  <description>Selenium Hub Location connection port</description>
+</property>
+
+<property>
+  <name>selenium.hub.path</name>
+  <value>/wd/hub</value>
+  <description>Selenium Hub Location connection path</description>
+</property>
+
+<property>
+  <name>selenium.hub.host</name>
+  <value>localhost</value>
+  <description>Selenium Hub Location connection host</description>
+</property>
+
+<property>
+  <name>selenium.hub.protocol</name>
+  <value>http</value>
+  <description>Selenium Hub Location connection protocol</description>
+</property>
+
+<property>
+  <name>selenium.grid.driver</name>
+  <value>firefox</value>
+  <description>A String value representing the flavour of Selenium 
+    WebDriver() used on the selenium grid. Currently the following options
+    exist - 'firefox' or 'phantomJS' </description>
+</property>
+
+<property>
+  <name>selenium.grid.binary</name>
+  <value></value>
+  <description>A String value representing the path to the browser binary 
+    location for each node
+ </description>
+</property>
+
+<!-- lib-selenium configuration -->
+<property>
+  <name>libselenium.page.load.delay</name>
+  <value>3</value>
+  <description>
+    The delay in seconds to use when loading a page with lib-selenium. This
+    setting is used by protocol-selenium and protocol-interactiveselenium
+    since they depend on lib-selenium for fetching.
+  </description>
+</property>
+```
+ * If you've selected 'remote' value for the 'selenium.driver' property, ensure that you've configured
+ the additional properties based on your [Selenium-Grid installation](http://www.seleniumhq.org/docs/07_selenium_grid.jsp#installation).
+
+ * Compile nutch
+```
+ant runtime
+```
+
+ * Start your web crawl (Ensure that you followed the above steps and have started your xvfb display as shown above)
+
+## Part 3: Common Pitfalls
+
+* Be sure your browser version and selenium version are compatible (See list in 'Tested configurations' section below) 
+* Be sure to start the Xvfb window then start selenium (not a necessary step for PhantomJS)
+* Disconnecting and reconnecting nodes after a hub config change has proven useful in our tests. 
+* Be sure that each browser session deallocates its WebDriver resource independently of any other tests running on other browsers (check out driver.quit() and driver.close()). 
+
+### Tested configurations 
+
+* Firefox 31.4.0 and Selenium 2.48.2
+* PhantomJS 2.1.1 and Selenium 2.48.2
+* PhantomJS 2.1.1 and Selenium 2.53.0
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/build-ivy.xml b/nutch-plugins/protocol-selenium/build-ivy.xml
new file mode 100644
index 0000000..67d39cd
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/build.xml b/nutch-plugins/protocol-selenium/build.xml
new file mode 100644
index 0000000..055018f
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/build.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-selenium/*.jar" />
+    </fileset>
+  </path>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/ivy.xml b/nutch-plugins/protocol-selenium/ivy.xml
new file mode 100644
index 0000000..ff07f8c
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="default"/>
+  </publications>
+
+  <dependencies>
+    <!-- Note: only dependencies which are not contained in lib-selenium have to be listed here! -->
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/plugin.xml b/nutch-plugins/protocol-selenium/plugin.xml
new file mode 100644
index 0000000..1454c1b
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-selenium"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-selenium.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+      <import plugin="lib-selenium"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.selenium"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.selenium.Http"
+                      class="org.apache.nutch.protocol.selenium.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/pom.xml b/nutch-plugins/protocol-selenium/pom.xml
new file mode 100644
index 0000000..a94c7ec
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/pom.xml
@@ -0,0 +1,50 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-selenium</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-selenium</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-selenium</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/Http.java b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/Http.java
new file mode 100644
index 0000000..7726bdf
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/Http.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.selenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  public Http() {
+    super(LOG);
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpResponse.java
new file mode 100644
index 0000000..681e838
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+
+/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */
+
+public class HttpResponse implements Response {
+
+  private Http http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    if (!"http".equals(url.getProtocol()))
+      throw new HttpException("Not an HTTP url:" + url);
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuffer reqStr = new StringBuffer("GET ");
+      if (http.useProxy(url)) {
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      byte[] reqBytes = reqStr.toString().getBytes();
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+              Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with Selenium only if content type in HTML or XHTML 
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          readPlainContent(url);
+        } else {
+          try {
+            int contentLength = Integer.MAX_VALUE;
+            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+            if (contentLengthString != null) {
+              try {
+                contentLength = Integer.parseInt(contentLengthString.trim());
+              } catch (NumberFormatException ex) {
+                throw new HttpException("bad content length: " + contentLengthString);
+              }
+            }
+
+            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+              contentLength = http.getMaxContent();
+            }
+
+            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+            int bufferFilled = 0;
+            int totalRead = 0;
+            ByteArrayOutputStream out = new ByteArrayOutputStream();
+            while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
+                && totalRead + bufferFilled <= contentLength) {
+              totalRead += bufferFilled;
+              out.write(buffer, 0, bufferFilled);
+            }
+
+            content = out.toByteArray();
+
+          } catch (Exception e) {
+            if (code == 200)
+              throw new IOException(e.toString());
+            // for codes other than 200 OK, we are fine with empty content
+          } finally {
+            if (in != null) {
+              in.close();
+            }
+          }
+        }
+      } 
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  private void readPlainContent(URL url) throws IOException {
+    String page = HttpWebClient.getHtmlPage(url.toString(), conf);
+
+    content = page.getBytes("UTF-8");
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  // Adds headers to our headers Metadata
+  private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+          || ((pos = line.indexOf("<html")) != -1)) {
+
+        in.unread(line.substring(pos).getBytes("UTF-8"));
+        line.setLength(pos);
+
+        try {
+          //TODO: (CM) We don't know the header names here
+          //since we're just handling them generically. It would
+          //be nice to provide some sort of mapping function here
+          //for the returned header names to the standard metadata
+          //names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
+      throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    in.unread(value);
+    return value;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/package.html b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/package.html
new file mode 100644
index 0000000..75cd5b5
--- /dev/null
+++ b/nutch-plugins/protocol-selenium/src/main/java/org/apache/nutch/protocol/selenium/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-depth/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/build.xml b/nutch-plugins/scoring-depth/build.xml
new file mode 100644
index 0000000..6c041ed
--- /dev/null
+++ b/nutch-plugins/scoring-depth/build.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<project name="scoring-depth" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-depth/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/ivy.xml b/nutch-plugins/scoring-depth/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/scoring-depth/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-depth/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/plugin.xml b/nutch-plugins/scoring-depth/plugin.xml
new file mode 100644
index 0000000..ea57dc6
--- /dev/null
+++ b/nutch-plugins/scoring-depth/plugin.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="scoring-depth"
+   name="Scoring plugin for depth-limited crawling."
+   version="1.0.0"
+   provider-name="ant.com">
+
+   <runtime>
+      <library name="scoring-depth.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.scoring.depth"
+              name="Depth Scoring Filter"
+              point="org.apache.nutch.scoring.ScoringFilter">
+      <implementation id="DepthScoringFilter"
+                      class="org.apache.nutch.scoring.depth.DepthScoringFilter"/>
+   </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-depth/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/pom.xml b/nutch-plugins/scoring-depth/pom.xml
new file mode 100644
index 0000000..64ebe18
--- /dev/null
+++ b/nutch-plugins/scoring-depth/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>scoring-depth</artifactId>
+    <packaging>jar</packaging>
+
+    <name>scoring-depth</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
new file mode 100644
index 0000000..0a0dd27
--- /dev/null
+++ b/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -0,0 +1,207 @@
+package org.apache.nutch.scoring.depth;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * This scoring filter limits the number of hops from the initial seed urls. If
+ * the number of hops exceeds the depth (either the default value, or the one
+ * set in the injector file) then all outlinks from that url are discarded,
+ * effectively stopping further crawling along this path.
+ */
+public class DepthScoringFilter extends Configured implements ScoringFilter {
+  private static final Log LOG = LogFactory.getLog(DepthScoringFilter.class);
+
+  public static final String DEPTH_KEY = "_depth_";
+  public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY);
+  public static final String MAX_DEPTH_KEY = "_maxdepth_";
+  public static final Text MAX_DEPTH_KEY_W = new Text(MAX_DEPTH_KEY);
+
+  // maximum value that we are never likely to reach
+  // because the depth of the Web graph is that high only
+  // for spam cliques.
+  public static final int DEFAULT_MAX_DEPTH = 1000;
+
+  private int defaultMaxDepth;
+
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null)
+      return;
+    defaultMaxDepth = conf.getInt("scoring.depth.max", DEFAULT_MAX_DEPTH);
+    if (defaultMaxDepth <= 0) {
+      defaultMaxDepth = DEFAULT_MAX_DEPTH;
+    }
+  }
+
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    String depthString = parseData.getMeta(DEPTH_KEY);
+    if (depthString == null) {
+      LOG.warn("Missing depth, removing all outlinks from url " + fromUrl);
+      targets.clear();
+      return adjust;
+    }
+    int curDepth = Integer.parseInt(depthString);
+    int curMaxDepth = defaultMaxDepth;
+    IntWritable customMaxDepth = null;
+    // allow overrides from injector
+    String maxDepthString = parseData.getMeta(MAX_DEPTH_KEY);
+    if (maxDepthString != null) {
+      curMaxDepth = Integer.parseInt(maxDepthString);
+      customMaxDepth = new IntWritable(curMaxDepth);
+    }
+    if (curDepth >= curMaxDepth) {
+      // depth exceeded - throw away
+      LOG.info("Depth limit (" + curMaxDepth
+          + ") reached, ignoring outlinks for " + fromUrl);
+      targets.clear();
+      return adjust;
+    }
+    Iterator<Entry<Text, CrawlDatum>> it = targets.iterator();
+    while (it.hasNext()) {
+      Entry<Text, CrawlDatum> e = it.next();
+      // record increased depth
+      e.getValue().getMetaData()
+          .put(DEPTH_KEY_W, new IntWritable(curDepth + 1));
+      // record maxDepth if any
+      if (customMaxDepth != null) {
+        e.getValue().getMetaData().put(MAX_DEPTH_KEY_W, customMaxDepth);
+      }
+    }
+    return adjust;
+  }
+
+  // prioritize by smaller values of depth
+  @Override
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    // boost up by current depth
+    int curDepth, curMaxDepth;
+    IntWritable maxDepth = (IntWritable) datum.getMetaData().get(
+        MAX_DEPTH_KEY_W);
+    if (maxDepth != null) {
+      curMaxDepth = maxDepth.get();
+    } else {
+      curMaxDepth = defaultMaxDepth;
+    }
+    IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
+    if (depth == null) {
+      // penalize
+      curDepth = curMaxDepth;
+    } else {
+      curDepth = depth.get();
+    }
+    int mul = curMaxDepth - curDepth;
+    return initSort * (1 + mul);
+  }
+
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return initScore;
+  }
+
+  @Override
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    // the datum might already have some values set
+    // e.g. obtained from redirection
+    // in which case we don't want to override them
+    if (datum.getMetaData().get(MAX_DEPTH_KEY_W) == null)
+      datum.getMetaData()
+          .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth));
+    // initial depth is 1
+    if (datum.getMetaData().get(DEPTH_KEY_W) == null)
+      datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1));
+  }
+
+  @Override
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+
+    // check for the presence of the depth limit key
+    if (datum.getMetaData().get(MAX_DEPTH_KEY_W) != null) {
+      // convert from Text to Int
+      String depthString = datum.getMetaData().get(MAX_DEPTH_KEY_W).toString();
+      datum.getMetaData().remove(MAX_DEPTH_KEY_W);
+      int depth = Integer.parseInt(depthString);
+      datum.getMetaData().put(MAX_DEPTH_KEY_W, new IntWritable(depth));
+    } else { // put the default
+      datum.getMetaData()
+          .put(MAX_DEPTH_KEY_W, new IntWritable(defaultMaxDepth));
+    }
+    // initial depth is 1
+    datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(1));
+  }
+
+  @Override
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+    String depth = content.getMetadata().get(DEPTH_KEY);
+    if (depth != null) {
+      parse.getData().getParseMeta().set(DEPTH_KEY, depth);
+    }
+    String maxdepth = content.getMetadata().get(MAX_DEPTH_KEY);
+    if (maxdepth != null) {
+      parse.getData().getParseMeta().set(MAX_DEPTH_KEY, maxdepth);
+    }
+  }
+
+  @Override
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+    IntWritable depth = (IntWritable) datum.getMetaData().get(DEPTH_KEY_W);
+    if (depth != null) {
+      content.getMetadata().set(DEPTH_KEY, depth.toString());
+    }
+    IntWritable maxdepth = (IntWritable) datum.getMetaData().get(
+        MAX_DEPTH_KEY_W);
+    if (maxdepth != null) {
+      content.getMetadata().set(MAX_DEPTH_KEY, maxdepth.toString());
+    }
+  }
+
+  @Override
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+    // find a minimum of all depths
+    int newDepth = DEFAULT_MAX_DEPTH;
+    if (old != null) {
+      IntWritable oldDepth = (IntWritable) old.getMetaData().get(DEPTH_KEY_W);
+      if (oldDepth != null) {
+        newDepth = oldDepth.get();
+      } else {
+        // not set ?
+        initialScore(url, old);
+      }
+    }
+    for (CrawlDatum lnk : inlinked) {
+      IntWritable depth = (IntWritable) lnk.getMetaData().get(DEPTH_KEY_W);
+      if (depth != null && depth.get() < newDepth) {
+        newDepth = depth.get();
+      }
+    }
+    datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(newDepth));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/package-info.java b/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/package-info.java
new file mode 100644
index 0000000..aa89797
--- /dev/null
+++ b/nutch-plugins/scoring-depth/src/main/java/org/apache/nutch/scoring/depth/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Scoring filter to stop crawling at a configurable depth
+ * (number of "hops" from seed URLs).
+ */
+package org.apache.nutch.scoring.depth;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/build.xml b/nutch-plugins/scoring-link/build.xml
new file mode 100644
index 0000000..123b1ea
--- /dev/null
+++ b/nutch-plugins/scoring-link/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-link" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/scoring-link/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/scoring-link/ivy.xml b/nutch-plugins/scoring-link/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/scoring-link/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>


[48/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/FetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/FetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/FetchSchedule.java
new file mode 100755
index 0000000..10ee185
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/FetchSchedule.java
@@ -0,0 +1,208 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+
/**
 * This interface defines the contract for implementations that manipulate fetch
 * times and re-fetch intervals.
 *
 * @author Andrzej Bialecki
 */
public interface FetchSchedule extends Configurable {

  /** It is unknown whether page was changed since our last visit. */
  public static final int STATUS_UNKNOWN = 0;
  /** Page is known to have been modified since our last visit. */
  public static final int STATUS_MODIFIED = 1;
  /** Page is known to remain unmodified since our last visit. */
  public static final int STATUS_NOTMODIFIED = 2;

  /** Number of seconds in a day. */
  public static final int SECONDS_PER_DAY = 3600 * 24;

  /**
   * Initialize fetch schedule related data. Implementations should at least set
   * the <code>fetchTime</code> and <code>fetchInterval</code>. The default
   * implementation set the <code>fetchTime</code> to now, using the default
   * <code>fetchInterval</code>.
   * 
   * @param url
   *          URL of the page.
   * 
   * @param datum
   *          datum instance to be initialized.
   * 
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than {@link CrawlDatum}, but
   *         implementations should make sure that it contains at least all
   *         information from {@link CrawlDatum}.
   */
  public CrawlDatum initializeSchedule(Text url, CrawlDatum datum);

  /**
   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
   * successfully fetched page. Implementations may use supplied arguments to
   * support different re-fetching schedules.
   * 
   * @param url
   *          url of the page
   * 
   * @param datum
   *          page description to be adjusted. NOTE: this instance, passed by
   *          reference, may be modified inside the method.
   * 
   * @param prevFetchTime
   *          previous value of fetch time, or 0 if not available.
   * 
   * @param prevModifiedTime
   *          previous value of modifiedTime, or 0 if not available.
   * 
   * @param fetchTime
   *          the latest time, when the page was recently re-fetched. Most
   *          FetchSchedule implementations should update the value in
   *          {@link CrawlDatum} to something greater than this value.
   * 
   * @param modifiedTime
   *          last time the content was modified. This information comes from
   *          the protocol implementations, or is set to < 0 if not available.
   *          Most FetchSchedule implementations should update the value in
   *          {@link CrawlDatum} to this value.
   * 
   * @param state
   *          if {@link #STATUS_MODIFIED}, then the content is considered to be
   *          "changed" before the <code>fetchTime</code>, if
   *          {@link #STATUS_NOTMODIFIED} then the content is known to be
   *          unchanged. This information may be obtained by comparing page
   *          signatures before and after fetching. If this is set to
   *          {@link #STATUS_UNKNOWN}, then it is unknown whether the page was
   *          changed; implementations are free to follow a sensible default
   *          behavior.
   * 
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than {@link CrawlDatum}, but
   *         implementations should make sure that it contains at least all
   *         information from {@link CrawlDatum}.
   */
  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
      long prevFetchTime, long prevModifiedTime, long fetchTime,
      long modifiedTime, int state);

  /**
   * This method specifies how to schedule refetching of pages marked as GONE.
   * Default implementation increases fetchInterval by 50%, and if it exceeds
   * the <code>maxInterval</code> it calls
   * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
   * 
   * @param url
   *          URL of the page
   * 
   * @param datum
   *          datum instance to be adjusted.
   * 
   * @param prevFetchTime
   *          previous value of fetch time, or 0 if not available.
   * 
   * @param prevModifiedTime
   *          previous value of modifiedTime, or 0 if not available.
   * 
   * @param fetchTime
   *          the time of the current fetch attempt.
   * 
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than {@link CrawlDatum}, but
   *         implementations should make sure that it contains at least all
   *         information from {@link CrawlDatum}.
   */
  public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
      long prevFetchTime, long prevModifiedTime, long fetchTime);

  /**
   * This method adjusts the fetch schedule if fetching needs to be re-tried due
   * to transient errors. The default implementation sets the next fetch time 1
   * day in the future and increases the retry counter.
   * 
   * @param url
   *          URL of the page.
   * 
   * @param datum
   *          page information.
   * 
   * @param prevFetchTime
   *          previous fetch time.
   * 
   * @param prevModifiedTime
   *          previous modified time.
   * 
   * @param fetchTime
   *          current fetch time.
   * 
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than {@link CrawlDatum}, but
   *         implementations should make sure that it contains at least all
   *         information from {@link CrawlDatum}.
   */
  public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
      long prevFetchTime, long prevModifiedTime, long fetchTime);

  /**
   * Calculates last fetch time of the given CrawlDatum.
   * 
   * @param datum
   *          datum instance to read the schedule information from.
   * 
   * @return the date as a long.
   */
  public long calculateLastFetchTime(CrawlDatum datum);

  /**
   * This method provides information whether the page is suitable for selection
   * in the current fetchlist. NOTE: a true return value does not guarantee that
   * the page will be fetched, it just allows it to be included in the further
   * selection process based on scores. The default implementation checks
   * <code>fetchTime</code>, if it is higher than the curTime it returns false,
   * and true otherwise. It will also check that fetchTime is not too remote
   * (more than <code>maxInterval</code>), in which case it lowers the interval
   * and returns true.
   * 
   * @param url
   *          URL of the page.
   * 
   * @param datum
   *          datum instance.
   * 
   * @param curTime
   *          reference time (usually set to the time when the fetchlist
   *          generation process was started).
   * 
   * @return true, if the page should be considered for inclusion in the current
   *         fetchlist, otherwise false.
   */
  public boolean shouldFetch(Text url, CrawlDatum datum, long curTime);

  /**
   * This method resets fetchTime, fetchInterval, modifiedTime and page
   * signature, so that it forces refetching.
   * 
   * @param url
   *          URL of the page.
   * 
   * @param datum
   *          datum instance.
   * 
   * @param asap
   *          if true, force refetch as soon as possible - this sets the
   *          fetchTime to now. If false, force refetch whenever the next fetch
   *          time is set.
   * 
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than {@link CrawlDatum}, but
   *         implementations should make sure that it contains at least all
   *         information from {@link CrawlDatum}.
   */
  public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap);
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/FetchScheduleFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/FetchScheduleFactory.java b/nutch-core/src/main/java/org/apache/nutch/crawl/FetchScheduleFactory.java
new file mode 100755
index 0000000..7a84524
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/FetchScheduleFactory.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.ObjectCache;
+
+/** Creates and caches a {@link FetchSchedule} implementation. */
+public class FetchScheduleFactory {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(FetchScheduleFactory.class);
+
+  private FetchScheduleFactory() {
+  } // no public ctor
+
+  /** Return the FetchSchedule implementation. */
+  public synchronized static FetchSchedule getFetchSchedule(Configuration conf) {
+    String clazz = conf.get("db.fetch.schedule.class",
+        DefaultFetchSchedule.class.getName());
+    ObjectCache objectCache = ObjectCache.get(conf);
+    FetchSchedule impl = (FetchSchedule) objectCache.getObject(clazz);
+    if (impl == null) {
+      try {
+        LOG.info("Using FetchSchedule impl: " + clazz);
+        Class<?> implClass = Class.forName(clazz);
+        impl = (FetchSchedule) implClass.newInstance();
+        impl.setConf(conf);
+        objectCache.setObject(clazz, impl);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create " + clazz, e);
+      }
+    }
+    return impl;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/Generator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/Generator.java b/nutch-core/src/main/java/org/apache/nutch/crawl/Generator.java
new file mode 100644
index 0000000..9a82089
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/Generator.java
@@ -0,0 +1,859 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.net.*;
+import java.util.*;
+import java.text.*;
+
+// Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.jexl2.Expression;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilterException;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.JexlUtil;
+import org.apache.nutch.util.LockUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+
+/**
+ * Generates a subset of a crawl db to fetch. This version allows to generate
+ * fetchlists for several segments in one go. Unlike in the initial version
+ * (OldGenerator), the IP resolution is done ONLY on the entries which have been
+ * selected for fetching. The URLs are partitioned by IP, domain or host within
+ * a segment. We can chose separately how to count the URLS i.e. by domain or
+ * host to limit the entries.
+ **/
+public class Generator extends NutchTool implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Generator.class);
+
+  public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
+  public static final String GENERATOR_MIN_SCORE = "generate.min.score";
+  public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval";
+  public static final String GENERATOR_RESTRICT_STATUS = "generate.restrict.status";
+  public static final String GENERATOR_FILTER = "generate.filter";
+  public static final String GENERATOR_NORMALISE = "generate.normalise";
+  public static final String GENERATOR_MAX_COUNT = "generate.max.count";
+  public static final String GENERATOR_COUNT_MODE = "generate.count.mode";
+  public static final String GENERATOR_COUNT_VALUE_DOMAIN = "domain";
+  public static final String GENERATOR_COUNT_VALUE_HOST = "host";
+  public static final String GENERATOR_TOP_N = "generate.topN";
+  public static final String GENERATOR_CUR_TIME = "generate.curTime";
+  public static final String GENERATOR_DELAY = "crawl.gen.delay";
+  public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments";
+  public static final String GENERATOR_EXPR = "generate.expr";
+
+  public static class SelectorEntry implements Writable {
+    public Text url;
+    public CrawlDatum datum;
+    public IntWritable segnum;
+
+    public SelectorEntry() {
+      url = new Text();
+      datum = new CrawlDatum();
+      segnum = new IntWritable(0);
+    }
+
+    public void readFields(DataInput in) throws IOException {
+      url.readFields(in);
+      datum.readFields(in);
+      segnum.readFields(in);
+    }
+
+    public void write(DataOutput out) throws IOException {
+      url.write(out);
+      datum.write(out);
+      segnum.write(out);
+    }
+
+    public String toString() {
+      return "url=" + url.toString() + ", datum=" + datum.toString()
+          + ", segnum=" + segnum.toString();
+    }
+  }
+
+  /** Selects entries due for fetch. */
+  public static class Selector implements
+      Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>,
+      Partitioner<FloatWritable, Writable>,
+      Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> {
+    private LongWritable genTime = new LongWritable(System.currentTimeMillis());
+    private long curTime;
+    private long limit;
+    private long count;
+    private HashMap<String, int[]> hostCounts = new HashMap<String, int[]>();
+    private int segCounts[];
+    private int maxCount;
+    private boolean byDomain = false;
+    private Partitioner<Text, Writable> partitioner = new URLPartitioner();
+    private URLFilters filters;
+    private URLNormalizers normalizers;
+    private ScoringFilters scfilters;
+    private SelectorEntry entry = new SelectorEntry();
+    private FloatWritable sortValue = new FloatWritable();
+    private boolean filter;
+    private boolean normalise;
+    private long genDelay;
+    private FetchSchedule schedule;
+    private float scoreThreshold = 0f;
+    private int intervalThreshold = -1;
+    private String restrictStatus = null;
+    private int maxNumSegments = 1;
+    private Expression expr = null;
+    private int currentsegmentnum = 1;
+
    /**
     * Reads all generator settings from the job configuration. Called once
     * per task before any map/reduce invocations.
     */
    public void configure(JobConf job) {
      curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());
      // per-reducer share of the global topN limit
      limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE)
          / job.getNumReduceTasks();
      maxCount = job.getInt(GENERATOR_MAX_COUNT, -1);
      // NOTE(review): when maxCount is -1 byDomain is reset here, but the
      // count-mode check below can still set it - confirm this is intended.
      if (maxCount == -1) {
        byDomain = false;
      }
      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE)))
        byDomain = true;
      filters = new URLFilters(job);
      normalise = job.getBoolean(GENERATOR_NORMALISE, true);
      // normalizers stays null when normalisation is disabled
      if (normalise)
        normalizers = new URLNormalizers(job,
            URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
      scfilters = new ScoringFilters(job);
      partitioner.configure(job);
      filter = job.getBoolean(GENERATOR_FILTER, true);
      // re-generation delay: configured in days, converted to milliseconds
      genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
      long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
      if (time > 0)
        genTime.set(time);
      schedule = FetchScheduleFactory.getFetchSchedule(job);
      // NaN means "no score threshold configured"
      scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
      intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
      restrictStatus = job.get(GENERATOR_RESTRICT_STATUS, null);
      // optional JEXL expression used to filter CrawlDatum entries
      expr = JexlUtil.parseExpression(job.get(GENERATOR_EXPR, null));
      maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
      segCounts = new int[maxNumSegments];
    }
+
    /** No per-task resources to release. */
    public void close() {
    }
+
+    /**
+     * Select &amp; invert subset due for fetch: emits (score, entry) pairs so
+     * the shuffle sorts entries by decreasing score. A URL is dropped when it
+     * fails filtering, is not yet due according to the fetch schedule, was
+     * generated recently (within genDelay), fails the JEXL expression, has a
+     * status other than the configured one, scores below the threshold, or
+     * has a fetch interval above the threshold.
+     */
+    public void map(Text key, CrawlDatum value,
+        OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
+        throws IOException {
+      Text url = key;
+      if (filter) {
+        // If filtering is on don't generate URLs that don't pass
+        // URLFilters
+        try {
+          if (filters.filter(url.toString()) == null)
+            return;
+        } catch (URLFilterException e) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage()
+                + ")");
+          }
+        }
+      }
+      CrawlDatum crawlDatum = value;
+
+      // check fetch schedule
+      if (!schedule.shouldFetch(url, crawlDatum, curTime)) {
+        LOG.debug("-shouldFetch rejected '" + url + "', fetchTime="
+            + crawlDatum.getFetchTime() + ", curTime=" + curTime);
+        return;
+      }
+
+      LongWritable oldGenTime = (LongWritable) crawlDatum.getMetaData().get(
+          Nutch.WRITABLE_GENERATE_TIME_KEY);
+      if (oldGenTime != null) { // awaiting fetch & update
+        if (oldGenTime.get() + genDelay > curTime) // still wait for
+          // update
+          return;
+      }
+      float sort = 1.0f;
+      try {
+        sort = scfilters.generatorSortValue(key, crawlDatum, sort);
+      } catch (ScoringFilterException sfe) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe);
+        }
+      }
+
+      // check expr
+      if (expr != null) {
+        if (!crawlDatum.evaluate(expr)) {
+          return;
+        }
+      }
+
+      if (restrictStatus != null
+          && !restrictStatus.equalsIgnoreCase(CrawlDatum
+              .getStatusName(crawlDatum.getStatus())))
+        return;
+
+      // consider only entries with a score superior to the threshold.
+      // Fixed: the former test 'scoreThreshold != Float.NaN' is always true
+      // (NaN compares unequal to everything, including itself);
+      // Float.isNaN() is the correct way to detect an unset threshold.
+      if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold)
+        return;
+
+      // consider only entries with a retry (or fetch) interval lower than
+      // threshold
+      if (intervalThreshold != -1
+          && crawlDatum.getFetchInterval() > intervalThreshold)
+        return;
+
+      // sort by decreasing score, using DecreasingFloatComparator
+      sortValue.set(sort);
+      // record generation time
+      crawlDatum.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
+      entry.datum = crawlDatum;
+      entry.url = key;
+      output.collect(sortValue, entry); // invert for sort by score
+    }
+
+    /**
+     * Partition by host / domain or IP. Delegates to the URLPartitioner using
+     * the entry's URL (not the float sort key) so all URLs of one host/domain
+     * land in the same reducer.
+     */
+    public int getPartition(FloatWritable key, Writable value,
+        int numReduceTasks) {
+      return partitioner.getPartition(((SelectorEntry) value).url, key,
+          numReduceTasks);
+    }
+
+    /**
+     * Collect until limit is reached. Entries arrive sorted by decreasing
+     * score. Each accepted entry is tagged with a segment number; when the
+     * per-segment limit or the per-host/domain maxCount is reached, entries
+     * spill over into the next segment (up to maxNumSegments), otherwise
+     * they are skipped.
+     */
+    public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
+        OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
+        throws IOException {
+
+      while (values.hasNext()) {
+
+        if (count == limit) {
+          // do we have any segments left?
+          if (currentsegmentnum < maxNumSegments) {
+            count = 0;
+            currentsegmentnum++;
+          } else
+            break;
+        }
+
+        SelectorEntry entry = values.next();
+        Text url = entry.url;
+        String urlString = url.toString();
+        URL u = null;
+
+        String hostordomain = null;
+
+        try {
+          if (normalise && normalizers != null) {
+            urlString = normalizers.normalize(urlString,
+                URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
+          }
+          u = new URL(urlString);
+          if (byDomain) {
+            hostordomain = URLUtil.getDomainName(u);
+          } else {
+            hostordomain = new URL(urlString).getHost();
+          }
+        } catch (Exception e) {
+          LOG.warn("Malformed URL: '" + urlString + "', skipping ("
+              + StringUtils.stringifyException(e) + ")");
+          reporter.getCounter("Generator", "MALFORMED_URL").increment(1);
+          continue;
+        }
+
+        // NOTE(review): toLowerCase() uses the default locale; for host names
+        // Locale.ROOT would be safer (e.g. Turkish dotless-i) — confirm.
+        hostordomain = hostordomain.toLowerCase();
+
+        // only filter if we are counting hosts or domains
+        if (maxCount > 0) {
+          // hostCount[0] = segment this host/domain currently fills,
+          // hostCount[1] = URLs counted for it in that segment
+          int[] hostCount = hostCounts.get(hostordomain);
+          if (hostCount == null) {
+            hostCount = new int[] { 1, 0 };
+            hostCounts.put(hostordomain, hostCount);
+          }
+
+          // increment hostCount
+          hostCount[1]++;
+
+          // check if topN reached, select next segment if it is
+          while (segCounts[hostCount[0] - 1] >= limit
+              && hostCount[0] < maxNumSegments) {
+            hostCount[0]++;
+            hostCount[1] = 0;
+          }
+
+          // reached the limit of allowed URLs per host / domain
+          // see if we can put it in the next segment?
+          if (hostCount[1] >= maxCount) {
+            if (hostCount[0] < maxNumSegments) {
+              hostCount[0]++;
+              hostCount[1] = 0;
+            } else {
+              // log once per host (only when the count first exceeds maxCount)
+              if (hostCount[1] == maxCount + 1 && LOG.isInfoEnabled()) {
+                LOG.info("Host or domain "
+                    + hostordomain
+                    + " has more than "
+                    + maxCount
+                    + " URLs for all "
+                    + maxNumSegments
+                    + " segments. Additional URLs won't be included in the fetchlist.");
+              }
+              // skip this entry
+              continue;
+            }
+          }
+          entry.segnum = new IntWritable(hostCount[0]);
+          segCounts[hostCount[0] - 1]++;
+        } else {
+          entry.segnum = new IntWritable(currentsegmentnum);
+          segCounts[currentsegmentnum - 1]++;
+        }
+
+        output.collect(key, entry);
+
+        // Count is incremented only when we keep the URL
+        // maxCount may cause us to skip it.
+        count++;
+      }
+    }
+  }
+
+  // Allows the reducers to generate one subfile per segment: output is routed
+  // into a "fetchlist-N" directory based on the segment number stored in the
+  // entry by Selector.reduce().
+  public static class GeneratorOutputFormat extends
+      MultipleSequenceFileOutputFormat<FloatWritable, SelectorEntry> {
+    // generate a filename based on the segnum stored for this entry
+    protected String generateFileNameForKeyValue(FloatWritable key,
+        SelectorEntry value, String name) {
+      return "fetchlist-" + value.segnum.toString() + "/" + name;
+    }
+
+  }
+
+  /** Comparator sorting {@link FloatWritable} keys in decreasing order. */
+  public static class DecreasingFloatComparator extends
+      FloatWritable.Comparator {
+
+    /** Compares two FloatWritables decreasing. */
+    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+      // delegate to the ascending comparator with operands swapped
+      return super.compare(b2, s2, l2, b1, s1, l1);
+    }
+  }
+
+  /** Inverts (score, entry) pairs back into (url, entry) pairs. */
+  public static class SelectorInverseMapper extends MapReduceBase implements
+      Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> {
+
+    public void map(FloatWritable key, SelectorEntry value,
+        OutputCollector<Text, SelectorEntry> output, Reporter reporter)
+        throws IOException {
+      // Re-key each entry by its URL; the float score key is discarded.
+      output.collect(value.url, value);
+    }
+  }
+
+  /**
+   * Emits the final (url, CrawlDatum) pairs that make up a per-segment
+   * fetchlist.
+   */
+  public static class PartitionReducer extends MapReduceBase implements
+      Reducer<Text, SelectorEntry, Text, CrawlDatum> {
+
+    public void reduce(Text key, Iterator<SelectorEntry> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
+      // if using HashComparator, we get only one input key in case of
+      // hash collision
+      // so use only URLs from values
+      while (values.hasNext()) {
+        SelectorEntry entry = values.next();
+        output.collect(entry.url, entry.datum);
+      }
+    }
+
+  }
+
+  /**
+   * Sort fetch lists by hash of URL. Spreads URLs of the same host across the
+   * fetchlist instead of clustering them, which improves fetch politeness.
+   */
+  public static class HashComparator extends WritableComparator {
+    public HashComparator() {
+      super(Text.class);
+    }
+
+    @SuppressWarnings("rawtypes")
+    public int compare(WritableComparable a, WritableComparable b) {
+      Text url1 = (Text) a;
+      Text url2 = (Text) b;
+      int hash1 = hash(url1.getBytes(), 0, url1.getLength());
+      int hash2 = hash(url2.getBytes(), 0, url2.getLength());
+      return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1));
+    }
+
+    // raw-bytes variant used during the shuffle sort; must agree with the
+    // object-based compare above
+    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+      int hash1 = hash(b1, s1, l1);
+      int hash2 = hash(b2, s2, l2);
+      return (hash1 < hash2 ? -1 : (hash1 == hash2 ? 0 : 1));
+    }
+
+    private static int hash(byte[] bytes, int start, int length) {
+      int hash = 1;
+      // make later bytes more significant in hash code, so that sorting
+      // by
+      // hashcode correlates less with by-host ordering.
+      for (int i = length - 1; i >= 0; i--)
+        hash = (31 * hash) + (int) bytes[start + i];
+      return hash;
+    }
+  }
+
+  /**
+   * Update the CrawlDB so that the next generate won't include the same URLs.
+   * Merges the freshly generated entries with the existing CrawlDb: entries
+   * whose stored generate time matches this run keep that timestamp; any
+   * other (pre-existing) entry is taken as the base record.
+   */
+  public static class CrawlDbUpdater extends MapReduceBase implements
+      Mapper<Text, CrawlDatum, Text, CrawlDatum>,
+      Reducer<Text, CrawlDatum, Text, CrawlDatum> {
+    long generateTime;
+
+    public void configure(JobConf job) {
+      generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
+    }
+
+    /** Identity map: pass every (url, datum) pair through to the reducer. */
+    public void map(Text key, CrawlDatum value,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
+      output.collect(key, value);
+    }
+
+    private CrawlDatum orig = new CrawlDatum();
+    private LongWritable genTime = new LongWritable(0L);
+
+    public void reduce(Text key, Iterator<CrawlDatum> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
+      genTime.set(0L);
+      while (values.hasNext()) {
+        CrawlDatum val = values.next();
+        if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
+          LongWritable gt = (LongWritable) val.getMetaData().get(
+              Nutch.WRITABLE_GENERATE_TIME_KEY);
+          genTime.set(gt.get());
+          if (genTime.get() != generateTime) {
+            // stale generate timestamp from a previous run: treat the datum
+            // as the base record and discard the timestamp
+            orig.set(val);
+            genTime.set(0L);
+            continue;
+          }
+        } else {
+          orig.set(val);
+        }
+      }
+      // only stamp the output when this run actually generated the URL
+      if (genTime.get() != 0L) {
+        orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
+      }
+      output.collect(key, orig);
+    }
+  }
+
+  /** Default constructor; a configuration must be set before use. */
+  public Generator() {
+  }
+
+  /** Construct a Generator with the given configuration. */
+  public Generator(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Convenience overload: reads the filter/normalise flags from the
+   * configuration, does not force lock removal and generates one segment.
+   */
+  public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
+      long curTime) throws IOException {
+
+    JobConf job = new NutchJob(getConf());
+    boolean filter = job.getBoolean(GENERATOR_FILTER, true);
+    boolean normalise = job.getBoolean(GENERATOR_NORMALISE, true);
+    return generate(dbDir, segments, numLists, topN, curTime, filter,
+        normalise, false, 1, null);
+  }
+
+  /**
+   * old signature used for compatibility - does not specify whether or not to
+   * normalise and set the number of segments to 1
+   **/
+  public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
+      long curTime, boolean filter, boolean force) throws IOException {
+    // normalisation enabled, one segment, no JEXL expression
+    return generate(dbDir, segments, numLists, topN, curTime, filter, true,
+        force, 1, null);
+  }
+
+  /**
+   * Generate fetchlists in one or more segments. Whether to filter URLs or not
+   * is read from the crawl.generate.filter property in the configuration files.
+   * If the property is not found, the URLs are filtered. Same for the
+   * normalisation.
+   * 
+   * @param dbDir
+   *          Crawl database directory
+   * @param segments
+   *          Segments directory
+   * @param numLists
+   *          Number of reduce tasks
+   * @param topN
+   *          Number of top URLs to be selected
+   * @param curTime
+   *          Current time in milliseconds
+   * @param filter
+   *          whether to apply URL filters to candidate URLs
+   * @param norm
+   *          whether to normalize candidate URLs
+   * @param force
+   *          overwrite an existing CrawlDb lock file
+   * @param maxNumSegments
+   *          maximum number of segments to generate
+   * @param expr
+   *          optional JEXL expression used to select CrawlDatum entries
+   * 
+   * @return Path to generated segment or null if no entries were selected
+   * 
+   * @throws IOException
+   *           When an I/O error occurs
+   */
+  public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
+      long curTime, boolean filter, boolean norm, boolean force,
+      int maxNumSegments, String expr) throws IOException {
+
+    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
+        + "/generate-temp-" + java.util.UUID.randomUUID().toString());
+
+    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
+    FileSystem fs = FileSystem.get(getConf());
+    LockUtil.createLockFile(fs, lock, force);
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("Generator: starting at " + sdf.format(start));
+    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
+    LOG.info("Generator: filtering: " + filter);
+    LOG.info("Generator: normalizing: " + norm);
+    if (topN != Long.MAX_VALUE) {
+      LOG.info("Generator: topN: " + topN);
+    }
+    if (expr != null) {
+      LOG.info("Generator: expr: " + expr);
+    }
+
+    // map to inverted subset due for fetch, sort by score
+    JobConf job = new NutchJob(getConf());
+    job.setJobName("generate: select from " + dbDir);
+
+    if (numLists == -1) { // for politeness make
+      numLists = job.getNumMapTasks(); // a partition per fetch task
+    }
+    if ("local".equals(job.get("mapreduce.framework.name")) && numLists != 1) {
+      // override
+      LOG.info("Generator: running in local mode, generating exactly one partition.");
+      numLists = 1;
+    }
+    job.setLong(GENERATOR_CUR_TIME, curTime);
+    // record real generation time
+    long generateTime = System.currentTimeMillis();
+    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
+    job.setLong(GENERATOR_TOP_N, topN);
+    job.setBoolean(GENERATOR_FILTER, filter);
+    job.setBoolean(GENERATOR_NORMALISE, norm);
+    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);
+    if (expr != null) {
+      job.set(GENERATOR_EXPR, expr);
+    }
+    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(Selector.class);
+    job.setPartitionerClass(Selector.class);
+    job.setReducerClass(Selector.class);
+
+    FileOutputFormat.setOutputPath(job, tempDir);
+    job.setOutputFormat(SequenceFileOutputFormat.class);
+    job.setOutputKeyClass(FloatWritable.class);
+    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
+    job.setOutputValueClass(SelectorEntry.class);
+    job.setOutputFormat(GeneratorOutputFormat.class);
+
+    try {
+      JobClient.runJob(job);
+    } catch (IOException e) {
+      LockUtil.removeLockFile(fs, lock);
+      fs.delete(tempDir, true);
+      throw e;
+    }
+
+    // read the subdirectories generated in the temp
+    // output and turn them into segments
+    List<Path> generatedSegments = new ArrayList<Path>();
+
+    FileStatus[] status = fs.listStatus(tempDir);
+    try {
+      for (FileStatus stat : status) {
+        Path subfetchlist = stat.getPath();
+        if (!subfetchlist.getName().startsWith("fetchlist-"))
+          continue;
+        // start a new partition job for this segment
+        Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
+        generatedSegments.add(newSeg);
+      }
+    } catch (Exception e) {
+      LOG.warn("Generator: exception while partitioning segments, exiting ...");
+      // release the CrawlDb lock on this failure path too; the other failure
+      // paths remove it, and leaving it behind would block subsequent runs
+      LockUtil.removeLockFile(fs, lock);
+      fs.delete(tempDir, true);
+      return null;
+    }
+
+    if (generatedSegments.size() == 0) {
+      LOG.warn("Generator: 0 records selected for fetching, exiting ...");
+      LockUtil.removeLockFile(fs, lock);
+      fs.delete(tempDir, true);
+      return null;
+    }
+
+    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
+      // update the db from tempDir
+      Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".")
+          + "/generate-temp-" + java.util.UUID.randomUUID().toString());
+
+      job = new NutchJob(getConf());
+      job.setJobName("generate: updatedb " + dbDir);
+      job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
+      for (Path segmpaths : generatedSegments) {
+        Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
+        FileInputFormat.addInputPath(job, subGenDir);
+      }
+      FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
+      job.setInputFormat(SequenceFileInputFormat.class);
+      job.setMapperClass(CrawlDbUpdater.class);
+      job.setReducerClass(CrawlDbUpdater.class);
+      job.setOutputFormat(MapFileOutputFormat.class);
+      job.setOutputKeyClass(Text.class);
+      job.setOutputValueClass(CrawlDatum.class);
+      FileOutputFormat.setOutputPath(job, tempDir2);
+      try {
+        JobClient.runJob(job);
+        CrawlDb.install(job, dbDir);
+      } catch (IOException e) {
+        LockUtil.removeLockFile(fs, lock);
+        fs.delete(tempDir, true);
+        fs.delete(tempDir2, true);
+        throw e;
+      }
+      fs.delete(tempDir2, true);
+    }
+
+    LockUtil.removeLockFile(fs, lock);
+    fs.delete(tempDir, true);
+
+    long end = System.currentTimeMillis();
+    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+
+    Path[] patharray = new Path[generatedSegments.size()];
+    return generatedSegments.toArray(patharray);
+  }
+
+  /**
+   * Runs a second job over one "fetchlist-N" subdirectory: re-inverts
+   * (score, entry) pairs to (url, datum), partitions by host/domain/IP for
+   * politeness and sorts by URL hash, producing one segment directory.
+   */
+  private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir,
+      int numLists) throws IOException {
+    // invert again, partition by host/domain/IP, sort by url hash
+    if (LOG.isInfoEnabled()) {
+      LOG.info("Generator: Partitioning selected urls for politeness.");
+    }
+    Path segment = new Path(segmentsDir, generateSegmentName());
+    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
+
+    LOG.info("Generator: segment: " + segment);
+
+    NutchJob job = new NutchJob(getConf());
+    job.setJobName("generate: partition " + segment);
+
+    job.setInt("partition.url.seed", new Random().nextInt());
+
+    FileInputFormat.addInputPath(job, inputDir);
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(SelectorInverseMapper.class);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(SelectorEntry.class);
+    job.setPartitionerClass(URLPartitioner.class);
+    job.setReducerClass(PartitionReducer.class);
+    job.setNumReduceTasks(numLists);
+
+    FileOutputFormat.setOutputPath(job, output);
+    job.setOutputFormat(SequenceFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+    job.setOutputKeyComparatorClass(HashComparator.class);
+    JobClient.runJob(job);
+    return segment;
+  }
+
+  private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
+
+  /**
+   * Produce a timestamp-based segment name (yyyyMMddHHmmss). Sleeps one
+   * second first so that two consecutive calls never yield the same name;
+   * the method is synchronized so the shared formatter is used safely.
+   */
+  public static synchronized String generateSegmentName() {
+    try {
+      Thread.sleep(1000);
+    } catch (Throwable t) {
+      // ignored: the sleep only exists to guarantee name uniqueness
+    }
+    return sdf.format(new Date(System.currentTimeMillis()));
+  }
+
+  /**
+   * Generate a fetchlist from the crawldb.
+   * Runs the tool via Hadoop's ToolRunner and exits with its return code.
+   */
+  public static void main(String args[]) throws Exception {
+    int res = ToolRunner
+        .run(NutchConfiguration.create(), new Generator(), args);
+    System.exit(res);
+  }
+
+  /**
+   * Command-line entry point; parses the options shown in the usage string
+   * and delegates to
+   * {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String)}.
+   *
+   * @return 0 on success, 1 if nothing was selected, -1 on error or bad usage
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.out
+          .println("Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]");
+      return -1;
+    }
+
+    Path dbDir = new Path(args[0]);
+    Path segmentsDir = new Path(args[1]);
+    long curTime = System.currentTimeMillis();
+    long topN = Long.MAX_VALUE;
+    int numFetchers = -1;
+    boolean filter = true;
+    boolean norm = true;
+    boolean force = false;
+    String expr = null;
+    int maxNumSegments = 1;
+
+    for (int i = 2; i < args.length; i++) {
+      if ("-topN".equals(args[i])) {
+        topN = Long.parseLong(args[i + 1]);
+        i++;
+      } else if ("-numFetchers".equals(args[i])) {
+        numFetchers = Integer.parseInt(args[i + 1]);
+        i++;
+      } else if ("-adddays".equals(args[i])) {
+        long numDays = Integer.parseInt(args[i + 1]);
+        curTime += numDays * 1000L * 60 * 60 * 24;
+        // skip the consumed value so it cannot be misread as an option
+        i++;
+      } else if ("-noFilter".equals(args[i])) {
+        filter = false;
+      } else if ("-noNorm".equals(args[i])) {
+        norm = false;
+      } else if ("-force".equals(args[i])) {
+        force = true;
+      } else if ("-maxNumSegments".equals(args[i])) {
+        maxNumSegments = Integer.parseInt(args[i + 1]);
+        i++;
+      } else if ("-expr".equals(args[i])) {
+        expr = args[i + 1];
+        i++;
+      }
+    }
+
+    try {
+      Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime,
+          filter, norm, force, maxNumSegments, expr);
+      if (segs == null)
+        return 1;
+    } catch (Exception e) {
+      LOG.error("Generator: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+    return 0;
+  }
+
+  /**
+   * Service (NutchTool) entry point: reads the same options as the CLI from
+   * the args map, resolves crawldb/segments paths (falling back to
+   * {@code crawlId}-relative defaults) and runs the generator. The outcome
+   * is reported via the {@code Nutch.VAL_RESULT} entry of the returned map
+   * ("0" success, "1" nothing selected, "-1" error).
+   */
+  @Override
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+
+    long curTime = System.currentTimeMillis();
+    long topN = Long.MAX_VALUE;
+    int numFetchers = -1;
+    boolean filter = true;
+    boolean norm = true;
+    boolean force = false;
+    int maxNumSegments = 1;
+    String expr = null;
+
+    Path crawlDb;
+    if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+      Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+      if(crawldbPath instanceof Path) {
+        crawlDb = (Path) crawldbPath;
+      }
+      else {
+        crawlDb = new Path(crawldbPath.toString());
+      }
+    }
+    else {
+      crawlDb = new Path(crawlId+"/crawldb");
+    }
+
+    Path segmentsDir;
+    if(args.containsKey(Nutch.ARG_SEGMENTDIR)) {
+      Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+      if(segDir instanceof Path) {
+        segmentsDir = (Path) segDir;
+      }
+      else {
+        segmentsDir = new Path(segDir.toString());
+      }
+    }
+    else {
+      segmentsDir = new Path(crawlId+"/segments");
+    }
+
+    if (args.containsKey("expr")) {
+      expr = (String)args.get("expr");
+    }
+    if (args.containsKey("topN")) {
+      topN = Long.parseLong((String)args.get("topN"));
+    }
+    if (args.containsKey("numFetchers")) {
+      numFetchers = Integer.parseInt((String)args.get("numFetchers"));
+    }
+    if (args.containsKey("adddays")) {
+      long numDays = Integer.parseInt((String)args.get("adddays"));
+      curTime += numDays * 1000L * 60 * 60 * 24;
+    }
+    // flags are switched by mere key presence; their values are ignored
+    if (args.containsKey("noFilter")) {
+      filter = false;
+    }
+    if (args.containsKey("noNorm")) {
+      norm = false;
+    }
+    if (args.containsKey("force")) {
+      force = true;
+    }
+    if (args.containsKey("maxNumSegments")) {
+      maxNumSegments = Integer.parseInt((String)args.get("maxNumSegments"));
+    }
+
+    try {
+      Path[] segs = generate(crawlDb, segmentsDir, numFetchers, topN, curTime,
+          filter, norm, force, maxNumSegments, expr);
+      if (segs == null){
+        results.put(Nutch.VAL_RESULT, Integer.toString(1));
+        return results;
+      }
+
+    } catch (Exception e) {
+      LOG.error("Generator: " + StringUtils.stringifyException(e));
+      results.put(Nutch.VAL_RESULT, Integer.toString(-1));
+      return results;
+    }
+    results.put(Nutch.VAL_RESULT, Integer.toString(0));
+    return results;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/Injector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/Injector.java b/nutch-core/src/main/java/org/apache/nutch/crawl/Injector.java
new file mode 100644
index 0000000..383aaf1
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/Injector.java
@@ -0,0 +1,510 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.LockUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+/**
+ * <p>
+ * Injector takes a flat file of URLs and merges ("injects") these URLs into the
+ * CrawlDb. Useful for bootstrapping a Nutch crawl. The URL files contain one
+ * URL per line, optionally followed by custom metadata separated by tabs with
+ * the metadata key separated from the corresponding value by '='.
+ * </p>
+ * <p>
+ * Note, that some metadata keys are reserved:
+ * <dl>
+ * <dt>nutch.score</dt>
+ * <dd>allows to set a custom score for a specific URL</dd>
+ * <dt>nutch.fetchInterval</dt>
+ * <dd>allows to set a custom fetch interval for a specific URL</dd>
+ * <dt>nutch.fetchInterval.fixed</dt>
+ * <dd>allows to set a custom fetch interval for a specific URL that is not
+ * changed by AdaptiveFetchSchedule</dd>
+ * </dl>
+ * </p>
+ * <p>
+ * Example:
+ * 
+ * <pre>
+ *  http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
+ * </pre>
+ * </p>
+ **/
+public class Injector extends NutchTool implements Tool {
+  public static final Logger LOG = LoggerFactory.getLogger(Injector.class);
+
+  /** metadata key reserved for setting a custom score for a specific URL */
+  public static String nutchScoreMDName = "nutch.score";
+
+  /**
+   * metadata key reserved for setting a custom fetchInterval for a specific URL
+   */
+  public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
+
+  /**
+   * metadata key reserved for setting a fixed custom fetchInterval for a
+   * specific URL
+   */
+  // NOTE(review): these three are public, mutable statics; if nothing
+  // reassigns them they should be declared final — confirm against callers.
+  public static String nutchFixedFetchIntervalMDName = "nutch.fetchInterval.fixed";
+
+  /**
+   * Mapper over both the seed URL files (Text values: optional tab-separated
+   * metadata) and the existing CrawlDb (CrawlDatum values). Seed lines are
+   * normalized, filtered and turned into injected CrawlDatum records;
+   * existing entries are re-filtered/normalized and passed through.
+   */
+  public static class InjectMapper
+      extends Mapper<Text, Writable, Text, CrawlDatum> {
+    public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
+    public static final String TAB_CHARACTER = "\t";
+    public static final String EQUAL_CHARACTER = "=";
+
+    private URLNormalizers urlNormalizers;
+    private int interval;
+    private float scoreInjected;
+    private URLFilters filters;
+    private ScoringFilters scfilters;
+    private long curTime;
+    private boolean url404Purging;
+    private String scope;
+
+    public void setup(Context context) {
+      Configuration conf = context.getConfiguration();
+      scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_INJECT);
+      urlNormalizers = new URLNormalizers(conf, scope);
+      // default fetch interval: 30 days (in seconds)
+      interval = conf.getInt("db.fetch.interval.default", 2592000);
+      filters = new URLFilters(conf);
+      scfilters = new ScoringFilters(conf);
+      scoreInjected = conf.getFloat("db.score.injected", 1.0f);
+      curTime = conf.getLong("injector.current.time",
+          System.currentTimeMillis());
+      url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
+    }
+
+    /* Filter and normalize the input url; returns null when rejected */
+    private String filterNormalize(String url) {
+      if (url != null) {
+        try {
+          url = urlNormalizers.normalize(url, scope); // normalize the url
+          url = filters.filter(url); // filter the url
+        } catch (Exception e) {
+          LOG.warn("Skipping " + url + ":" + e);
+          url = null;
+        }
+      }
+      return url;
+    }
+
+    /**
+     * Extract metadata that could be passed along with url in a seeds file.
+     * Metadata must be key-value pair(s) and separated by a TAB_CHARACTER
+     */
+    private void processMetaData(String metadata, CrawlDatum datum,
+        String url) {
+      String[] splits = metadata.split(TAB_CHARACTER);
+
+      for (String split : splits) {
+        // find separation between name and value
+        int indexEquals = split.indexOf(EQUAL_CHARACTER);
+        if (indexEquals == -1) // skip anything without a EQUAL_CHARACTER
+          continue;
+
+        String metaname = split.substring(0, indexEquals);
+        String metavalue = split.substring(indexEquals + 1);
+
+        try {
+          // reserved keys set datum properties; everything else becomes
+          // plain metadata on the datum
+          if (metaname.equals(nutchScoreMDName)) {
+            datum.setScore(Float.parseFloat(metavalue));
+          } else if (metaname.equals(nutchFetchIntervalMDName)) {
+            datum.setFetchInterval(Integer.parseInt(metavalue));
+          } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
+            int fixedInterval = Integer.parseInt(metavalue);
+            if (fixedInterval > -1) {
+              // Set writable using float. Float is used by
+              // AdaptiveFetchSchedule
+              datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY,
+                  new FloatWritable(fixedInterval));
+              datum.setFetchInterval(fixedInterval);
+            }
+          } else {
+            datum.getMetaData().put(new Text(metaname), new Text(metavalue));
+          }
+        } catch (NumberFormatException nfe) {
+          LOG.error("Invalid number '" + metavalue + "' in metadata '"
+              + metaname + "' for url " + url);
+        }
+      }
+    }
+
+    public void map(Text key, Writable value, Context context)
+        throws IOException, InterruptedException {
+      if (value instanceof Text) {
+        // if its a url from the seed list
+        String url = key.toString().trim();
+
+        // remove empty string or string starting with '#'
+        if (url.length() == 0 || url.startsWith("#"))
+          return;
+
+        url = filterNormalize(url);
+        if (url == null) {
+          context.getCounter("injector", "urls_filtered").increment(1);
+        } else {
+          CrawlDatum datum = new CrawlDatum();
+          datum.setStatus(CrawlDatum.STATUS_INJECTED);
+          datum.setFetchTime(curTime);
+          datum.setScore(scoreInjected);
+          datum.setFetchInterval(interval);
+
+          String metadata = value.toString().trim();
+          if (metadata.length() > 0)
+            processMetaData(metadata, datum, url);
+
+          try {
+            key.set(url);
+            scfilters.injectedScore(key, datum);
+          } catch (ScoringFilterException e) {
+            if (LOG.isWarnEnabled()) {
+              LOG.warn("Cannot filter injected score for url " + url
+                  + ", using default (" + e.getMessage() + ")");
+            }
+          }
+          context.getCounter("injector", "urls_injected").increment(1);
+          context.write(key, datum);
+        }
+      } else if (value instanceof CrawlDatum) {
+        // if its a crawlDatum from the input crawldb, emulate CrawlDbFilter's
+        // map()
+        CrawlDatum datum = (CrawlDatum) value;
+
+        // remove 404 urls
+        if (url404Purging && CrawlDatum.STATUS_DB_GONE == datum.getStatus())
+          return;
+
+        String url = filterNormalize(key.toString());
+        if (url != null) {
+          key.set(url);
+          context.write(key, datum);
+        }
+      }
+    }
+  }
+
  /**
   * Combine multiple new entries for a url.
   * <p>
   * Merges freshly injected records (status {@code STATUS_INJECTED}) with any
   * pre-existing CrawlDb record for the same URL, honoring the 'overwrite'
   * and 'update' flags read from the job configuration.
   */
  public static class InjectReducer
      extends Reducer<Text, CrawlDatum, Text, CrawlDatum> {
    private int interval;              // default fetch interval (db.fetch.interval.default)
    private float scoreInjected;       // default score for injected urls (db.score.injected)
    private boolean overwrite = false; // injected record replaces an existing one
    private boolean update = false;    // injected record is merged into an existing one
    // Reusable holders for the existing and the newly injected datum.
    private CrawlDatum old = new CrawlDatum();
    private CrawlDatum injected = new CrawlDatum();

    // Reads the merge policy and default values from the job configuration.
    public void setup(Context context) {
      Configuration conf = context.getConfiguration();
      interval = conf.getInt("db.fetch.interval.default", 2592000);
      scoreInjected = conf.getFloat("db.score.injected", 1.0f);
      overwrite = conf.getBoolean("db.injector.overwrite", false);
      update = conf.getBoolean("db.injector.update", false);
      LOG.info("Injector: overwrite: " + overwrite);
      LOG.info("Injector: update: " + update);
    }

    /**
     * Merge the input records as per rules below :
     * 
     * <pre>
     * 1. If there is ONLY new injected record ==> emit injected record
     * 2. If there is ONLY old record          ==> emit existing record
     * 3. If BOTH new and old records are present:
     *    (a) If 'overwrite' is true           ==> emit injected record
     *    (b) If 'overwrite' is false :
     *        (i)  If 'update' is false        ==> emit existing record
     *        (ii) If 'update' is true         ==> update existing record and emit it
     * </pre>
     * 
     * For more details @see NUTCH-1405
     */
    public void reduce(Text key, Iterable<CrawlDatum> values, Context context)
        throws IOException, InterruptedException {

      boolean oldSet = false;
      boolean injectedSet = false;

      // If we encounter a datum with status as STATUS_INJECTED, then its a
      // newly injected record. All other statuses correspond to an old record.
      // Copy each datum via set(): Hadoop reuses the value instance across
      // iterations (NOTE(review): assumes CrawlDatum.set performs a copy).
      for (CrawlDatum val : values) {
        if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
          injected.set(val);
          // Injected records enter the db as unfetched.
          injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
          injectedSet = true;
        } else {
          old.set(val);
          oldSet = true;
        }
      }

      CrawlDatum result;
      if (injectedSet && (!oldSet || overwrite)) {
        // corresponds to rules (1) and (3.a) in the method description
        result = injected;
      } else {
        // corresponds to rules (2) and (3.b) in the method description
        result = old;

        if (injectedSet && update) {
          // corresponds to rule (3.b.ii) in the method description
          // Score/interval are taken from the injected record only when they
          // differ from the configured defaults (i.e. explicitly set).
          old.putAllMetaData(injected);
          old.setScore(injected.getScore() != scoreInjected
              ? injected.getScore() : old.getScore());
          old.setFetchInterval(injected.getFetchInterval() != interval
              ? injected.getFetchInterval() : old.getFetchInterval());
        }
      }
      if (injectedSet && oldSet) {
        context.getCounter("injector", "urls_merged").increment(1);
      }
      context.write(key, result);
    }
  }
+
  /** Creates an unconfigured Injector; a Configuration must be set before use. */
  public Injector() {
  }

  /** Creates an Injector initialized with the given Configuration. */
  public Injector(Configuration conf) {
    setConf(conf);
  }
+
  /**
   * Injects seed URLs from {@code urlDir} into {@code crawlDb} with both
   * 'overwrite' and 'update' disabled (existing records win).
   */
  public void inject(Path crawlDb, Path urlDir)
      throws IOException, ClassNotFoundException, InterruptedException {
    inject(crawlDb, urlDir, false, false);
  }
+
+  public void inject(Path crawlDb, Path urlDir, boolean overwrite,
+      boolean update) throws IOException, ClassNotFoundException, InterruptedException {
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info("Injector: starting at " + sdf.format(start));
+      LOG.info("Injector: crawlDb: " + crawlDb);
+      LOG.info("Injector: urlDir: " + urlDir);
+      LOG.info("Injector: Converting injected urls to crawl db entries.");
+    }
+
+    // set configuration
+    Configuration conf = getConf();
+    conf.setLong("injector.current.time", System.currentTimeMillis());
+    conf.setBoolean("db.injector.overwrite", overwrite);
+    conf.setBoolean("db.injector.update", update);
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    // create all the required paths
+    FileSystem fs = FileSystem.get(conf);
+    Path current = new Path(crawlDb, CrawlDb.CURRENT_NAME);
+    if (!fs.exists(current))
+      fs.mkdirs(current);
+
+    Path tempCrawlDb = new Path(crawlDb,
+        "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    // lock an existing crawldb to prevent multiple simultaneous updates
+    Path lock = new Path(crawlDb, CrawlDb.LOCK_NAME);
+    LockUtil.createLockFile(fs, lock, false);
+
+    // configure job
+    Job job = Job.getInstance(conf, "inject " + urlDir);
+    job.setJarByClass(Injector.class);
+    job.setMapperClass(InjectMapper.class);
+    job.setReducerClass(InjectReducer.class);
+    job.setOutputFormatClass(MapFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+    job.setSpeculativeExecution(false);
+
+    // set input and output paths of the job
+    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
+    MultipleInputs.addInputPath(job, urlDir, KeyValueTextInputFormat.class);
+    FileOutputFormat.setOutputPath(job, tempCrawlDb);
+
+    try {
+      // run the job
+      job.waitForCompletion(true);
+
+      // save output and perform cleanup
+      CrawlDb.install(job, crawlDb);
+
+      if (LOG.isInfoEnabled()) {
+        long urlsInjected = job.getCounters()
+            .findCounter("injector", "urls_injected").getValue();
+        long urlsFiltered = job.getCounters()
+            .findCounter("injector", "urls_filtered").getValue();
+        long urlsMerged = job.getCounters()
+            .findCounter("injector", "urls_merged").getValue();
+        LOG.info("Injector: Total urls rejected by filters: " + urlsFiltered);
+        LOG.info(
+            "Injector: Total urls injected after normalization and filtering: "
+                + urlsInjected);
+        LOG.info("Injector: Total urls injected but already in CrawlDb: "
+            + urlsMerged);
+        LOG.info("Injector: Total new urls injected: "
+            + (urlsInjected - urlsMerged));
+
+        long end = System.currentTimeMillis();
+        LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
+            + TimingUtil.elapsedTime(start, end));
+      }
+    } catch (IOException e) {
+      if (fs.exists(tempCrawlDb)) {
+        fs.delete(tempCrawlDb, true);
+      }
+      LockUtil.removeLockFile(fs, lock);
+      throw e;
+    }
+  }
+
+  public void usage() {
+    System.err.println(
+        "Usage: Injector <crawldb> <url_dir> [-overwrite] [-update]\n");
+    System.err.println(
+        "  <crawldb>\tPath to a crawldb directory. If not present, a new one would be created.");
+    System.err.println(
+        "  <url_dir>\tPath to directory with URL file(s) containing urls to be injected. A URL file");
+    System.err.println(
+        "           \tshould have one URL per line, optionally followed by custom metadata.");
+    System.err.println(
+        "           \tBlank lines or lines starting with a '#' would be ignored. Custom metadata must");
+    System.err
+        .println("           \tbe of form 'key=value' and separated by tabs.");
+    System.err.println("           \tBelow are reserved metadata keys:\n");
+    System.err.println("           \t\tnutch.score: A custom score for a url");
+    System.err.println(
+        "           \t\tnutch.fetchInterval: A custom fetch interval for a url");
+    System.err.println(
+        "           \t\tnutch.fetchInterval.fixed: A custom fetch interval for a url that is not "
+            + "changed by AdaptiveFetchSchedule\n");
+    System.err.println("           \tExample:");
+    System.err.println("           \t http://www.apache.org/");
+    System.err.println(
+        "           \t http://www.nutch.org/ \\t nutch.score=10 \\t nutch.fetchInterval=2592000 \\t userType=open_source\n");
+    System.err.println(
+        " -overwrite\tOverwite existing crawldb records by the injected records. Has precedence over 'update'");
+    System.err.println(
+        " -update   \tUpdate existing crawldb records with the injected records. Old metadata is preserved");
+  }
+
  /** Command-line entry point; runs the Injector via Hadoop's ToolRunner. */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new Injector(), args);
    System.exit(res);
  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      usage();
+      return -1;
+    }
+
+    boolean overwrite = false;
+    boolean update = false;
+
+    for (int i = 2; i < args.length; i++) {
+      if (args[i].equals("-overwrite")) {
+        overwrite = true;
+      } else if (args[i].equals("-update")) {
+        update = true;
+      } else {
+        LOG.info("Injector: Found invalid argument \"" + args[i] + "\"\n");
+        usage();
+        return -1;
+      }
+    }
+
+    try {
+      inject(new Path(args[0]), new Path(args[1]), overwrite, update);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("Injector: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+
+  /**
+   * Used by the Nutch REST service
+   */
+  public Map<String, Object> run(Map<String, Object> args, String crawlId)
+      throws Exception {
+    if (args.size() < 1) {
+      throw new IllegalArgumentException("Required arguments <url_dir>");
+    }
+    Map<String, Object> results = new HashMap<String, Object>();
+
+    Path crawlDb;
+    if (args.containsKey(Nutch.ARG_CRAWLDB)) {
+      Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+      if (crawldbPath instanceof Path) {
+        crawlDb = (Path) crawldbPath;
+      } else {
+        crawlDb = new Path(crawldbPath.toString());
+      }
+    } else {
+      crawlDb = new Path(crawlId + "/crawldb");
+    }
+
+    Path input;
+    Object path = args.get(Nutch.ARG_SEEDDIR);
+    if (path instanceof Path) {
+      input = (Path) path;
+    } else {
+      input = new Path(path.toString());
+    }
+
+    inject(crawlDb, input);
+    results.put(Nutch.VAL_RESULT, Integer.toString(0));
+    return results;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/Inlink.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/Inlink.java b/nutch-core/src/main/java/org/apache/nutch/crawl/Inlink.java
new file mode 100644
index 0000000..67df357
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/Inlink.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import org.apache.hadoop.io.*;
+
+/* An incoming link to a page. */
+public class Inlink implements Writable {
+
+  private String fromUrl;
+  private String anchor;
+
+  public Inlink() {
+  }
+
+  public Inlink(String fromUrl, String anchor) {
+    this.fromUrl = fromUrl;
+    this.anchor = anchor;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    fromUrl = Text.readString(in);
+    anchor = Text.readString(in);
+  }
+
+  /** Skips over one Inlink in the input. */
+  public static void skip(DataInput in) throws IOException {
+    Text.skip(in); // skip fromUrl
+    Text.skip(in); // skip anchor
+  }
+
+  public void write(DataOutput out) throws IOException {
+    Text.writeString(out, fromUrl);
+    Text.writeString(out, anchor);
+  }
+
+  public static Inlink read(DataInput in) throws IOException {
+    Inlink inlink = new Inlink();
+    inlink.readFields(in);
+    return inlink;
+  }
+
+  public String getFromUrl() {
+    return fromUrl;
+  }
+
+  public String getAnchor() {
+    return anchor;
+  }
+
+  public boolean equals(Object o) {
+    if (!(o instanceof Inlink))
+      return false;
+    Inlink other = (Inlink) o;
+    return this.fromUrl.equals(other.fromUrl)
+        && this.anchor.equals(other.anchor);
+  }
+
+  public int hashCode() {
+    return fromUrl.hashCode() ^ anchor.hashCode();
+  }
+
+  public String toString() {
+    return "fromUrl: " + fromUrl + " anchor: " + anchor;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/Inlinks.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/Inlinks.java b/nutch-core/src/main/java/org/apache/nutch/crawl/Inlinks.java
new file mode 100644
index 0000000..89f9731
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/Inlinks.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.net.*;
+import java.util.*;
+
+import org.apache.hadoop.io.*;
+
+/** A list of {@link Inlink}s. */
+public class Inlinks implements Writable {
+  private HashSet<Inlink> inlinks = new HashSet<Inlink>(1);
+
+  public void add(Inlink inlink) {
+    inlinks.add(inlink);
+  }
+
+  public void add(Inlinks inlinks) {
+    this.inlinks.addAll(inlinks.inlinks);
+  }
+
+  public Iterator<Inlink> iterator() {
+    return this.inlinks.iterator();
+  }
+
+  public int size() {
+    return inlinks.size();
+  }
+
+  public void clear() {
+    inlinks.clear();
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    int length = in.readInt();
+    inlinks.clear();
+    for (int i = 0; i < length; i++) {
+      add(Inlink.read(in));
+    }
+  }
+
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(inlinks.size());
+    Iterator<Inlink> it = inlinks.iterator();
+    while (it.hasNext()) {
+      it.next().write(out);
+    }
+  }
+
+  public String toString() {
+    StringBuffer buffer = new StringBuffer();
+    buffer.append("Inlinks:\n");
+    Iterator<Inlink> it = inlinks.iterator();
+    while (it.hasNext()) {
+      buffer.append(" ");
+      buffer.append(it.next());
+      buffer.append("\n");
+    }
+    return buffer.toString();
+  }
+
+  /**
+   * Return the set of anchor texts. Only a single anchor with a given text is
+   * permitted from a given domain.
+   */
+  public String[] getAnchors() {
+    HashMap<String, Set<String>> domainToAnchors = new HashMap<String, Set<String>>();
+    ArrayList<String> results = new ArrayList<String>();
+    Iterator<Inlink> it = inlinks.iterator();
+    while (it.hasNext()) {
+      Inlink inlink = it.next();
+      String anchor = inlink.getAnchor();
+
+      if (anchor.length() == 0) // skip empty anchors
+        continue;
+      String domain = null; // extract domain name
+      try {
+        domain = new URL(inlink.getFromUrl()).getHost();
+      } catch (MalformedURLException e) {
+      }
+      Set<String> domainAnchors = domainToAnchors.get(domain);
+      if (domainAnchors == null) {
+        domainAnchors = new HashSet<String>();
+        domainToAnchors.put(domain, domainAnchors);
+      }
+      if (domainAnchors.add(anchor)) { // new anchor from domain
+        results.add(anchor); // collect it
+      }
+    }
+
+    return results.toArray(new String[results.size()]);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDb.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDb.java b/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDb.java
new file mode 100644
index 0000000..908a8e9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDb.java
@@ -0,0 +1,428 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.net.*;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.LockUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
+
+/** Maintains an inverted link map, listing incoming links for each url. */
+public class LinkDb extends NutchTool implements Tool,
+    Mapper<Text, ParseData, Text, Inlinks> {
+
+  public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
+
+  public static final String IGNORE_INTERNAL_LINKS = "linkdb.ignore.internal.links";
+  public static final String IGNORE_EXTERNAL_LINKS = "linkdb.ignore.external.links";
+
+  public static final String CURRENT_NAME = "current";
+  public static final String LOCK_NAME = ".locked";
+
+  private int maxAnchorLength;
+  private boolean ignoreInternalLinks;
+  private boolean ignoreExternalLinks;
+  private URLFilters urlFilters;
+  private URLNormalizers urlNormalizers;
+
+  public LinkDb() {
+  }
+
+  public LinkDb(Configuration conf) {
+    setConf(conf);
+  }
+
+  public void configure(JobConf job) {
+    maxAnchorLength = job.getInt("linkdb.max.anchor.length", 100);
+    ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
+    ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false);
+
+    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
+      urlFilters = new URLFilters(job);
+    }
+    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
+      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
+    }
+  }
+
+  public void close() {
+  }
+
+  public void map(Text key, ParseData parseData,
+      OutputCollector<Text, Inlinks> output, Reporter reporter)
+      throws IOException {
+    String fromUrl = key.toString();
+    String fromHost = getHost(fromUrl);
+    if (urlNormalizers != null) {
+      try {
+        fromUrl = urlNormalizers
+            .normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the
+                                                              // url
+      } catch (Exception e) {
+        LOG.warn("Skipping " + fromUrl + ":" + e);
+        fromUrl = null;
+      }
+    }
+    if (fromUrl != null && urlFilters != null) {
+      try {
+        fromUrl = urlFilters.filter(fromUrl); // filter the url
+      } catch (Exception e) {
+        LOG.warn("Skipping " + fromUrl + ":" + e);
+        fromUrl = null;
+      }
+    }
+    if (fromUrl == null)
+      return; // discard all outlinks
+    Outlink[] outlinks = parseData.getOutlinks();
+    Inlinks inlinks = new Inlinks();
+    for (int i = 0; i < outlinks.length; i++) {
+      Outlink outlink = outlinks[i];
+      String toUrl = outlink.getToUrl();
+
+      if (ignoreInternalLinks) {
+        String toHost = getHost(toUrl);
+        if (toHost == null || toHost.equals(fromHost)) { // internal link
+          continue; // skip it
+        }
+      } else if (ignoreExternalLinks) {
+        String toHost = getHost(toUrl);
+        if (toHost == null || !toHost.equals(fromHost)) { // external link
+          continue;                               // skip it
+        }
+      }
+      if (urlNormalizers != null) {
+        try {
+          toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize
+                                                                                // the
+                                                                                // url
+        } catch (Exception e) {
+          LOG.warn("Skipping " + toUrl + ":" + e);
+          toUrl = null;
+        }
+      }
+      if (toUrl != null && urlFilters != null) {
+        try {
+          toUrl = urlFilters.filter(toUrl); // filter the url
+        } catch (Exception e) {
+          LOG.warn("Skipping " + toUrl + ":" + e);
+          toUrl = null;
+        }
+      }
+      if (toUrl == null)
+        continue;
+      inlinks.clear();
+      String anchor = outlink.getAnchor(); // truncate long anchors
+      if (anchor.length() > maxAnchorLength) {
+        anchor = anchor.substring(0, maxAnchorLength);
+      }
+      inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link
+      output.collect(new Text(toUrl), inlinks);
+    }
+  }
+
+  private String getHost(String url) {
+    try {
+      return new URL(url).getHost().toLowerCase();
+    } catch (MalformedURLException e) {
+      return null;
+    }
+  }
+
+  public void invert(Path linkDb, final Path segmentsDir, boolean normalize,
+      boolean filter, boolean force) throws IOException {
+    final FileSystem fs = FileSystem.get(getConf());
+    FileStatus[] files = fs.listStatus(segmentsDir,
+        HadoopFSUtil.getPassDirectoriesFilter(fs));
+    invert(linkDb, HadoopFSUtil.getPaths(files), normalize, filter, force);
+  }
+
+  public void invert(Path linkDb, Path[] segments, boolean normalize,
+      boolean filter, boolean force) throws IOException {
+    JobConf job = LinkDb.createJob(getConf(), linkDb, normalize, filter);
+    Path lock = new Path(linkDb, LOCK_NAME);
+    FileSystem fs = FileSystem.get(getConf());
+    LockUtil.createLockFile(fs, lock, force);
+    Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    if (LOG.isInfoEnabled()) {
+      LOG.info("LinkDb: starting at " + sdf.format(start));
+      LOG.info("LinkDb: linkdb: " + linkDb);
+      LOG.info("LinkDb: URL normalize: " + normalize);
+      LOG.info("LinkDb: URL filter: " + filter);
+      if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
+        LOG.info("LinkDb: internal links will be ignored.");
+      }
+      if (job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) {
+        LOG.info("LinkDb: external links will be ignored.");
+      }
+    }
+    if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)
+        && job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) {
+      LOG.warn("LinkDb: internal and external links are ignored! "
+          + "Nothing to do, actually. Exiting.");
+      LockUtil.removeLockFile(fs, lock);
+      return;
+    }
+
+    for (int i = 0; i < segments.length; i++) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("LinkDb: adding segment: " + segments[i]);
+      }
+      FileInputFormat.addInputPath(job, new Path(segments[i],
+          ParseData.DIR_NAME));
+    }
+    try {
+      JobClient.runJob(job);
+    } catch (IOException e) {
+      LockUtil.removeLockFile(fs, lock);
+      throw e;
+    }
+    if (fs.exists(currentLinkDb)) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
+      }
+      // try to merge
+      Path newLinkDb = FileOutputFormat.getOutputPath(job);
+      job = LinkDbMerger.createMergeJob(getConf(), linkDb, normalize, filter);
+      FileInputFormat.addInputPath(job, currentLinkDb);
+      FileInputFormat.addInputPath(job, newLinkDb);
+      try {
+        JobClient.runJob(job);
+      } catch (IOException e) {
+        LockUtil.removeLockFile(fs, lock);
+        fs.delete(newLinkDb, true);
+        throw e;
+      }
+      fs.delete(newLinkDb, true);
+    }
+    LinkDb.install(job, linkDb);
+
+    long end = System.currentTimeMillis();
+    LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
+  private static JobConf createJob(Configuration config, Path linkDb,
+      boolean normalize, boolean filter) {
+    Path newLinkDb = new Path("linkdb-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+    JobConf job = new NutchJob(config);
+    job.setJobName("linkdb " + linkDb);
+
+    job.setInputFormat(SequenceFileInputFormat.class);
+
+    job.setMapperClass(LinkDb.class);
+    job.setCombinerClass(LinkDbMerger.class);
+    // if we don't run the mergeJob, perform normalization/filtering now
+    if (normalize || filter) {
+      try {
+        FileSystem fs = FileSystem.get(config);
+        if (!fs.exists(linkDb)) {
+          job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
+          job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
+        }
+      } catch (Exception e) {
+        LOG.warn("LinkDb createJob: " + e);
+      }
+    }
+    job.setReducerClass(LinkDbMerger.class);
+
+    FileOutputFormat.setOutputPath(job, newLinkDb);
+    job.setOutputFormat(MapFileOutputFormat.class);
+    job.setBoolean("mapred.output.compress", true);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Inlinks.class);
+
+    return job;
+  }
+
+  public static void install(JobConf job, Path linkDb) throws IOException {
+    Path newLinkDb = FileOutputFormat.getOutputPath(job);
+    FileSystem fs = new JobClient(job).getFs();
+    Path old = new Path(linkDb, "old");
+    Path current = new Path(linkDb, CURRENT_NAME);
+    if (fs.exists(current)) {
+      if (fs.exists(old))
+        fs.delete(old, true);
+      fs.rename(current, old);
+    }
+    fs.mkdirs(linkDb);
+    fs.rename(newLinkDb, current);
+    if (fs.exists(old))
+      fs.delete(old, true);
+    LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDb(), args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err
+          .println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
+      System.err.println("\tlinkdb\toutput LinkDb to create or update");
+      System.err
+          .println("\t-dir segmentsDir\tparent directory of several segments, OR");
+      System.err.println("\tseg1 seg2 ...\t list of segment directories");
+      System.err
+          .println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
+      System.err.println("\t-noNormalize\tdon't normalize link URLs");
+      System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
+      return -1;
+    }
+    final FileSystem fs = FileSystem.get(getConf());
+    Path db = new Path(args[0]);
+    ArrayList<Path> segs = new ArrayList<Path>();
+    boolean filter = true;
+    boolean normalize = true;
+    boolean force = false;
+    for (int i = 1; i < args.length; i++) {
+      if (args[i].equals("-dir")) {
+        FileStatus[] paths = fs.listStatus(new Path(args[++i]),
+            HadoopFSUtil.getPassDirectoriesFilter(fs));
+        segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
+      } else if (args[i].equalsIgnoreCase("-noNormalize")) {
+        normalize = false;
+      } else if (args[i].equalsIgnoreCase("-noFilter")) {
+        filter = false;
+      } else if (args[i].equalsIgnoreCase("-force")) {
+        force = true;
+      } else
+        segs.add(new Path(args[i]));
+    }
+    try {
+      invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force);
+      return 0;
+    } catch (Exception e) {
+      LOG.error("LinkDb: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+
+  /*
+   * Used for Nutch REST service
+   */
+  @Override
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+
+    Path linkdb;
+    if(args.containsKey(Nutch.ARG_LINKDB)) {
+      Object path = args.get(Nutch.ARG_LINKDB);
+      if(path instanceof Path) {
+        linkdb = (Path) path;
+      }
+      else {
+        linkdb = new Path(path.toString());
+      }
+    }
+    else {
+      linkdb = new Path(crawlId+"/linkdb");
+    }
+
+
+    ArrayList<Path> segs = new ArrayList<Path>();
+    boolean filter = true;
+    boolean normalize = true;
+    boolean force = false;
+    if (args.containsKey("noNormalize")) {
+      normalize = false;
+    } 
+    if (args.containsKey("noFilter")) {
+      filter = false;
+    } 
+    if (args.containsKey("force")) {
+      force = true;
+    }
+
+    Path segmentsDir;
+    final FileSystem fs = FileSystem.get(getConf());
+    if(args.containsKey(Nutch.ARG_SEGMENTDIR)) {
+      Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+      if(segDir instanceof Path) {
+        segmentsDir = (Path) segDir;
+      }
+      else {
+        segmentsDir = new Path(segDir.toString());
+      }
+      FileStatus[] paths = fs.listStatus(segmentsDir,
+          HadoopFSUtil.getPassDirectoriesFilter(fs));
+      segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
+    }
+    else if(args.containsKey(Nutch.ARG_SEGMENT)) {
+      Object segments = args.get(Nutch.ARG_SEGMENT);
+      ArrayList<String> segmentList = new ArrayList<String>();
+      if(segments instanceof ArrayList) {
+        segmentList = (ArrayList<String>)segments;
+      }
+      for(String segment: segmentList) {
+        segs.add(new Path(segment));
+      }
+    }
+    else {
+      String segment_dir = crawlId+"/segments";
+      File dir = new File(segment_dir);
+      File[] segmentsList = dir.listFiles();  
+      Arrays.sort(segmentsList, new Comparator<File>(){
+        @Override
+        public int compare(File f1, File f2) {
+          if(f1.lastModified()>f2.lastModified())
+            return -1;
+          else
+            return 0;
+        }      
+      });
+      segs.add(new Path(segmentsList[0].getPath()));
+    }
+    try {
+      invert(linkdb, segs.toArray(new Path[segs.size()]), normalize, filter, force);
+      results.put(Nutch.VAL_RESULT, Integer.toString(0));
+      return results;
+    } catch (Exception e) {
+      LOG.error("LinkDb: " + StringUtils.stringifyException(e));
+      results.put(Nutch.VAL_RESULT, Integer.toString(-1));
+      return results;
+    }
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbFilter.java b/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbFilter.java
new file mode 100644
index 0000000..1ff9b05
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/LinkDbFilter.java
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+
+/**
+ * This class provides a way to separate the URL normalization and filtering
+ * steps from the rest of LinkDb manipulation code. It maps each (url, Inlinks)
+ * pair, normalizing/filtering both the key URL and every inlink's source URL,
+ * and drops entries that fail the filters or end up with no inlinks.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class LinkDbFilter implements Mapper<Text, Inlinks, Text, Inlinks> {
+  /** Job property: when true, URLs are passed through {@link URLFilters}. */
+  public static final String URL_FILTERING = "linkdb.url.filters";
+
+  /** Job property: when true, URLs are passed through {@link URLNormalizers}. */
+  public static final String URL_NORMALIZING = "linkdb.url.normalizer";
+
+  /** Job property: normalizer scope; defaults to {@link URLNormalizers#SCOPE_LINKDB}. */
+  public static final String URL_NORMALIZING_SCOPE = "linkdb.url.normalizer.scope";
+
+  // Switches read from the job configuration in configure().
+  private boolean filter;
+
+  private boolean normalize;
+
+  // Instantiated in configure() only when the corresponding switch is on;
+  // otherwise left null (guarded by the 'filter'/'normalize' flags in map()).
+  private URLFilters filters;
+
+  private URLNormalizers normalizers;
+
+  private String scope;
+
+  public static final Logger LOG = LoggerFactory.getLogger(LinkDbFilter.class);
+
+  // Reused output key to avoid allocating a new Text per map() call.
+  private Text newKey = new Text();
+
+  /** Reads the filter/normalize switches and instantiates the plugin chains. */
+  public void configure(JobConf job) {
+    filter = job.getBoolean(URL_FILTERING, false);
+    normalize = job.getBoolean(URL_NORMALIZING, false);
+    if (filter) {
+      filters = new URLFilters(job);
+    }
+    if (normalize) {
+      scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB);
+      normalizers = new URLNormalizers(job, scope);
+    }
+  }
+
+  public void close() {
+  }
+
+  /**
+   * Normalizes and filters the key URL and each inlink URL. Any URL that a
+   * normalizer or filter rejects (returns null or throws) is skipped with a
+   * warning; records whose key URL is rejected, or whose inlink list becomes
+   * empty, are not emitted at all.
+   */
+  public void map(Text key, Inlinks value,
+      OutputCollector<Text, Inlinks> output, Reporter reporter)
+      throws IOException {
+    String url = key.toString();
+    Inlinks result = new Inlinks();
+    if (normalize) {
+      try {
+        url = normalizers.normalize(url, scope); // normalize the url
+      } catch (Exception e) {
+        LOG.warn("Skipping " + url + ":" + e);
+        url = null;
+      }
+    }
+    if (url != null && filter) {
+      try {
+        url = filters.filter(url); // filter the url
+      } catch (Exception e) {
+        LOG.warn("Skipping " + url + ":" + e);
+        url = null;
+      }
+    }
+    if (url == null)
+      return; // didn't pass the filters
+    // Apply the same normalize-then-filter pipeline to every inlink source URL.
+    Iterator<Inlink> it = value.iterator();
+    String fromUrl = null;
+    while (it.hasNext()) {
+      Inlink inlink = it.next();
+      fromUrl = inlink.getFromUrl();
+      if (normalize) {
+        try {
+          fromUrl = normalizers.normalize(fromUrl, scope); // normalize the url
+        } catch (Exception e) {
+          LOG.warn("Skipping " + fromUrl + ":" + e);
+          fromUrl = null;
+        }
+      }
+      if (fromUrl != null && filter) {
+        try {
+          fromUrl = filters.filter(fromUrl); // filter the url
+        } catch (Exception e) {
+          LOG.warn("Skipping " + fromUrl + ":" + e);
+          fromUrl = null;
+        }
+      }
+      if (fromUrl != null) {
+        result.add(new Inlink(fromUrl, inlink.getAnchor()));
+      }
+    }
+    if (result.size() > 0) { // don't collect empty inlinks
+      newKey.set(url);
+      output.collect(newKey, result);
+    }
+  }
+}


[32/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java
new file mode 100644
index 0000000..6c1bd9e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -0,0 +1,234 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Extracts some very basic statistics about domains from the crawldb.
+ * Counts fetched URLs aggregated by host, domain, suffix or top-level
+ * domain, and also emits overall FETCHED / NOT_FETCHED totals.
+ */
+public class DomainStatistics extends Configured implements Tool {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainStatistics.class);
+
+  // Marker keys emitted once per record so the job output also contains
+  // overall fetched / not-fetched totals alongside the per-domain counts.
+  private static final Text FETCHED_TEXT = new Text("FETCHED");
+  private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED");
+
+  /** Hadoop counters: records fetched, not fetched, or yielding an empty key. */
+  public static enum MyCounter {
+    FETCHED, NOT_FETCHED, EMPTY_RESULT
+  };
+
+  // Aggregation granularities selectable via the third command-line argument.
+  private static final int MODE_HOST = 1;
+  private static final int MODE_DOMAIN = 2;
+  private static final int MODE_SUFFIX = 3;
+  private static final int MODE_TLD = 4;
+
+  // NOTE(review): this field is shadowed by the local 'mode' in run() and is
+  // never read; the mapper reads the mode from the job configuration instead.
+  private int mode = 0;
+
+  /**
+   * Configures and runs the statistics MapReduce job.
+   *
+   * @param args inputDirs (comma-separated crawldb dirs), outDir,
+   *             mode (host|domain|suffix|tld), optional reducer count
+   * @return 0 when the job was submitted and completed, 1 on usage error
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length < 3) {
+      System.err.println("Usage: DomainStatistics inputDirs outDir mode [numOfReducer]");
+
+      System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
+      System.err.println("\t\t\tE.g.: crawl/crawldb/");
+
+      System.err.println("\toutDir\t\tOutput directory where results should be dumped");
+
+      System.err.println("\tmode\t\tSet statistics gathering mode");
+      System.err.println("\t\t\t\thost\tGather statistics by host");
+      System.err.println("\t\t\t\tdomain\tGather statistics by domain");
+      System.err.println("\t\t\t\tsuffix\tGather statistics by suffix");
+      // NOTE(review): "top level directory" below should almost certainly
+      // read "top level domain" (this is the 'tld' mode).
+      System.err.println("\t\t\t\ttld\tGather statistics by top level directory");
+
+      System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+      
+      return 1;
+    }
+    String inputDir = args[0];
+    String outputDir = args[1];
+    int numOfReducers = 1;
+
+    if (args.length > 3) {
+      numOfReducers = Integer.parseInt(args[3]);
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("DomainStatistics: starting at " + sdf.format(start));
+
+    // Translate the mode argument into the numeric mode passed to the mapper.
+    // An unrecognized argument leaves mode at 0; the mapper's switch then
+    // produces a null key (silently swallowed — see the mapper's catch block).
+    int mode = 0;
+    String jobName = "DomainStatistics";
+    if (args[2].equals("host")) {
+      jobName = "Host statistics";
+      mode = MODE_HOST;
+    } else if (args[2].equals("domain")) {
+      jobName = "Domain statistics";
+      mode = MODE_DOMAIN;
+    } else if (args[2].equals("suffix")) {
+      jobName = "Suffix statistics";
+      mode = MODE_SUFFIX;
+    } else if (args[2].equals("tld")) {
+      jobName = "TLD statistics";
+      mode = MODE_TLD;
+    }
+
+    Configuration conf = getConf();
+    conf.setInt("domain.statistics.mode", mode);
+    // Do not create _SUCCESS marker files in the output directory.
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    Job job = Job.getInstance(conf, jobName);
+    job.setJarByClass(DomainStatistics.class);
+
+    // Each input dir is a crawldb root; the CrawlDatum data lives in "current".
+    String[] inputDirsSpecs = inputDir.split(",");
+    for (int i = 0; i < inputDirsSpecs.length; i++) {
+      File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
+      FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+    }
+
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    FileOutputFormat.setOutputPath(job, new Path(outputDir));
+    job.setOutputFormatClass(TextOutputFormat.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(LongWritable.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(LongWritable.class);
+
+    job.setMapperClass(DomainStatisticsMapper.class);
+    job.setReducerClass(DomainStatisticsReducer.class);
+    job.setCombinerClass(DomainStatisticsCombiner.class);
+    job.setNumReduceTasks(numOfReducers);
+
+    try {
+      // NOTE(review): the boolean completion status is ignored and the catch
+      // merely rethrows, so this try/catch is a no-op; a job that completes
+      // unsuccessfully still makes run() return 0.
+      job.waitForCompletion(true);
+    } catch (Exception e) {
+      throw e;
+    }
+
+    long end = System.currentTimeMillis();
+    LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+    return 0;
+  }
+
+  /** Maps each fetched CrawlDatum to ((host|domain|suffix|tld), 1). */
+  static class DomainStatisticsMapper extends
+      Mapper<Text, CrawlDatum, Text, LongWritable> {
+    // Aggregation mode, read from the job configuration in setup().
+    int mode = 0;
+
+    public void setup(Context context) {
+      mode = context.getConfiguration().getInt("domain.statistics.mode",
+          MODE_DOMAIN);
+    }
+
+    public void map(Text urlText, CrawlDatum datum, Context context)
+        throws IOException, InterruptedException {
+
+      // Only count URLs that were actually fetched (or fetched earlier and
+      // found unmodified); everything else is tallied as NOT_FETCHED.
+      if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
+          || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+
+        try {
+          URL url = new URL(urlText.toString());
+          String out = null;
+          switch (mode) {
+          case MODE_HOST:
+            out = url.getHost();
+            break;
+          case MODE_DOMAIN:
+            out = URLUtil.getDomainName(url);
+            break;
+          case MODE_SUFFIX:
+            out = URLUtil.getDomainSuffix(url).getDomain();
+            break;
+          case MODE_TLD:
+            out = URLUtil.getTopLevelDomainName(url);
+            break;
+          }
+          if (out.trim().equals("")) {
+            LOG.info("url : " + url);
+            context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
+          }
+
+          context.write(new Text(out), new LongWritable(1));
+        } catch (Exception ex) {
+          // NOTE(review): empty catch — malformed URLs and NPEs (e.g. 'out'
+          // left null when mode is 0 or unresolved) are silently swallowed;
+          // consider at least logging them at debug level.
+        }
+
+        context.getCounter(MyCounter.FETCHED).increment(1);
+        context.write(FETCHED_TEXT, new LongWritable(1));
+      } else {
+        context.getCounter(MyCounter.NOT_FETCHED).increment(1);
+        context.write(NOT_FETCHED_TEXT, new LongWritable(1));
+      }
+    }
+  }
+
+  /**
+   * Sums the per-key counts. Note the key and value are deliberately swapped
+   * on output, so each result line reads "count TAB name".
+   */
+  static class DomainStatisticsReducer extends
+      Reducer<Text, LongWritable, LongWritable, Text> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+
+      // Emit (count, name) — reversed relative to the mapper output.
+      context.write(new LongWritable(total), key);
+    }
+  }
+
+  /** Combiner: partial sums per key, keeping the (name, count) order. */
+  public static class DomainStatisticsCombiner extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+      context.write(key, new LongWritable(total));
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(NutchConfiguration.create(), new DomainStatistics(), args);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
new file mode 100644
index 0000000..d40ebe9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * This class represents the last part of the host name, which is operated by
+ * authorities, not individuals. This information is needed to find the domain
+ * name of a host. The domain name of a host is defined to be the last part
+ * before the domain suffix, w/o subdomain names. As an example the domain name
+ * of <br>
+ * <code> http://lucene.apache.org/ 
+ * </code><br>
+ * is <code> apache.org</code> <br>
+ * This class holds three fields, <strong>domain</strong> field represents the
+ * suffix (such as "co.uk") <strong>boost</strong> is a float for boosting score
+ * of url's with this suffix <strong>status</strong> field represents domain's
+ * status
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ * @see TopLevelDomain for info please see conf/domain-suffixes.xml
+ */
+public class DomainSuffix {
+
+  /**
+   * Enumeration of the status of the tld. Please see domain-suffixes.xml.
+   */
+  public enum Status {
+    INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
+  };
+
+  // The suffix string itself (e.g. "co.uk"); also used as the map key in
+  // DomainSuffixes and as the toString() representation.
+  private String domain;
+  private Status status;
+  private float boost;
+
+  public static final float DEFAULT_BOOST = 1.0f;
+  public static final Status DEFAULT_STATUS = Status.IN_USE;
+
+  /** Full constructor specifying suffix string, status and scoring boost. */
+  public DomainSuffix(String domain, Status status, float boost) {
+    this.domain = domain;
+    this.status = status;
+    this.boost = boost;
+  }
+
+  /** Convenience constructor using {@link #DEFAULT_STATUS} and {@link #DEFAULT_BOOST}. */
+  public DomainSuffix(String domain) {
+    this(domain, DEFAULT_STATUS, DEFAULT_BOOST);
+  }
+
+  public String getDomain() {
+    return domain;
+  }
+
+  public Status getStatus() {
+    return status;
+  }
+
+  public float getBoost() {
+    return boost;
+  }
+
+  @Override
+  public String toString() {
+    return domain;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
new file mode 100644
index 0000000..765457e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.InputStream;
+import java.util.HashMap;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Storage class for <code>DomainSuffix</code> objects Note: this class is
+ * singleton
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class DomainSuffixes {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainSuffixes.class);
+
+  // Suffix string (e.g. "co.uk") -> parsed DomainSuffix entry.
+  private HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>();
+
+  private static DomainSuffixes instance;
+
+  /** private ctor — loads domain-suffixes.xml from the classpath. */
+  private DomainSuffixes() {
+    String file = "domain-suffixes.xml";
+    InputStream input = this.getClass().getClassLoader()
+        .getResourceAsStream(file);
+    try {
+      // NOTE(review): 'input' is never closed, and a parse failure leaves the
+      // singleton with an empty map (the exception is only logged).
+      new DomainSuffixesReader().read(this, input);
+    } catch (Exception ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+    }
+  }
+
+  /**
+   * Singleton instance, lazy instantiation
+   * 
+   * NOTE(review): this lazy init is not thread-safe; concurrent first calls
+   * can construct (and parse the xml for) two instances. Consider an eager
+   * static field or a holder-class idiom.
+   * 
+   * @return returns the domain suffix instance
+   */
+  public static DomainSuffixes getInstance() {
+    if (instance == null) {
+      instance = new DomainSuffixes();
+    }
+    return instance;
+  }
+
+  /** Registers a suffix entry under its domain string (used by the reader). */
+  void addDomainSuffix(DomainSuffix tld) {
+    domains.put(tld.getDomain(), tld);
+  }
+
+  /** return whether the extension is a registered domain entry */
+  public boolean isDomainSuffix(String extension) {
+    return domains.containsKey(extension);
+  }
+
+  /**
+   * Return the {@link DomainSuffix} object for the extension, if extension is a
+   * top level domain returned object will be an instance of
+   * {@link TopLevelDomain}
+   * 
+   * @param extension
+   *          of the domain
+   */
+  public DomainSuffix get(String extension) {
+    return domains.get(extension);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
new file mode 100644
index 0000000..a2a60e2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.domain.DomainSuffix.Status;
+import org.apache.nutch.util.domain.TopLevelDomain.Type;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * For parsing xml files containing domain suffix definitions. Parsed xml files
+ * should validate against <code>domain-suffixes.xsd</code>
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+class DomainSuffixesReader {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainSuffixesReader.class);
+
+  /**
+   * Parses the given xml stream (root element &lt;domains&gt; with child
+   * &lt;tlds&gt; and &lt;suffixes&gt; sections) and registers every entry
+   * into the supplied {@link DomainSuffixes} store.
+   *
+   * NOTE(review): the input is normally the bundled domain-suffixes.xml; if
+   * this is ever fed untrusted XML, DTD/external-entity resolution should be
+   * disabled on the factory to prevent XXE — TODO confirm callers.
+   */
+  void read(DomainSuffixes tldEntries, InputStream input) throws IOException {
+    try {
+
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      factory.setIgnoringComments(true);
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(new InputSource(input));
+
+      Element root = document.getDocumentElement();
+
+      if (root != null && root.getTagName().equals("domains")) {
+
+        Element tlds = (Element) root.getElementsByTagName("tlds").item(0);
+        Element suffixes = (Element) root.getElementsByTagName("suffixes")
+            .item(0);
+
+        // read tlds
+        readITLDs(tldEntries, (Element) tlds.getElementsByTagName("itlds")
+            .item(0));
+        readGTLDs(tldEntries, (Element) tlds.getElementsByTagName("gtlds")
+            .item(0));
+        readCCTLDs(tldEntries, (Element) tlds.getElementsByTagName("cctlds")
+            .item(0));
+
+        readSuffixes(tldEntries, suffixes);
+      } else {
+        throw new IOException("xml file is not valid");
+      }
+    } catch (ParserConfigurationException ex) {
+      // Wrap parser setup/parse failures as IOException; the message is kept
+      // but the original cause is only logged, not chained.
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw new IOException(ex.getMessage());
+    } catch (SAXException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw new IOException(ex.getMessage());
+    }
+  }
+
+  /** Reads infrastructure TLD entries (&lt;itlds&gt; section). */
+  void readITLDs(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
+          Type.INFRASTRUCTURE));
+    }
+  }
+
+  /** Reads generic TLD entries (&lt;gtlds&gt; section). */
+  void readGTLDs(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
+          Type.GENERIC));
+    }
+  }
+
+  /** Reads country-code TLD entries (&lt;cctlds&gt; section). */
+  void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readCCTLD((Element) children.item(i)));
+    }
+  }
+
+  /** Builds a non-country TLD from a &lt;tld&gt; element (domain attr + optional status/boost). */
+  TopLevelDomain readGTLD(Element el, Type type) {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    return new TopLevelDomain(domain, type, status, boost);
+  }
+
+  /** Builds a country-code TLD, which additionally requires a country name. */
+  TopLevelDomain readCCTLD(Element el) throws IOException {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    String countryName = readCountryName(el);
+    return new TopLevelDomain(domain, status, boost, countryName);
+  }
+
+  /** read optional field status */
+  Status readStatus(Element el) {
+    NodeList list = el.getElementsByTagName("status");
+    if (list == null || list.getLength() == 0)
+      return DomainSuffix.DEFAULT_STATUS;
+    return Status.valueOf(list.item(0).getFirstChild().getNodeValue());
+  }
+
+  /** read optional field boost */
+  float readBoost(Element el) {
+    NodeList list = el.getElementsByTagName("boost");
+    if (list == null || list.getLength() == 0)
+      return DomainSuffix.DEFAULT_BOOST;
+    return Float.parseFloat(list.item(0).getFirstChild().getNodeValue());
+  }
+
+  /**
+   * read field country name (required for cctlds)
+   *
+   * NOTE(review): item(0) here is a &lt;country&gt; Element, and per the DOM
+   * spec getNodeValue() of an Element node is null — unlike readStatus and
+   * readBoost, which use getFirstChild().getNodeValue() to reach the text
+   * node. This likely returns null for every country name; confirm and align
+   * with the other readers.
+   */
+  String readCountryName(Element el) throws IOException {
+    NodeList list = el.getElementsByTagName("country");
+    if (list == null || list.getLength() == 0)
+      throw new IOException("Country name should be given");
+    return list.item(0).getNodeValue();
+  }
+
+  /** Reads generic suffix entries (&lt;suffixes&gt; section). */
+  void readSuffixes(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("suffix");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readSuffix((Element) children.item(i)));
+    }
+  }
+
+  /** Builds a plain DomainSuffix from a &lt;suffix&gt; element. */
+  DomainSuffix readSuffix(Element el) {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    return new DomainSuffix(domain, status, boost);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java
new file mode 100644
index 0000000..f442d1f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * (From wikipedia) A top-level domain (TLD) is the last part of an Internet
+ * domain name; that is, the letters which follow the final dot of any domain
+ * name. For example, in the domain name <code>www.website.com</code>, the
+ * top-level domain is <code>com</code>.
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ * 
+ * @see <a href="http://www.iana.org/"> iana.org</a>
+ * 
+ * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">
+ *      Top-level_domain</a>
+ */
+public class TopLevelDomain extends DomainSuffix {
+
+  /** TLD category as defined in domain-suffixes.xml. */
+  public enum Type {
+    INFRASTRUCTURE, GENERIC, COUNTRY
+  };
+
+  private Type type;
+  // Only set for COUNTRY-type TLDs; null otherwise.
+  private String countryName = null;
+
+  /** Constructor for infrastructure/generic TLDs (no country name). */
+  public TopLevelDomain(String domain, Type type, Status status, float boost) {
+    super(domain, status, boost);
+    this.type = type;
+  }
+
+  /** Constructor for country-code TLDs; the type is implicitly COUNTRY. */
+  public TopLevelDomain(String domain, Status status, float boost,
+      String countryName) {
+    super(domain, status, boost);
+    this.type = Type.COUNTRY;
+    this.countryName = countryName;
+  }
+
+  public Type getType() {
+    return type;
+  }
+
+  /**
+   * Returns the country name if TLD is Country Code TLD
+   * 
+   * @return country name or null
+   */
+  public String getCountryName() {
+    return countryName;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html b/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html
new file mode 100644
index 0000000..49e0e6a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html
@@ -0,0 +1,14 @@
+<html>
+<body>
+<h2>Classes for domain name analysis.</h2>
+
+For information, please refer to the following URLs:
+<ul>
+<li><a href="http://en.wikipedia.org/wiki/DNS">http://en.wikipedia.org/wiki/DNS</a></li>
+<li><a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a></li>
+<li><a href="http://wiki.mozilla.org/TLD_List">http://wiki.mozilla.org/TLD_List</a></li>
+<li><a href="http://publicsuffix.org/">http://publicsuffix.org/</a></li>
+</ul>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/package-info.java b/nutch-core/src/main/java/org/apache/nutch/util/package-info.java
new file mode 100644
index 0000000..053dbc1
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Miscellaneous utility classes.
+ */
+package org.apache.nutch.util;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java
new file mode 100644
index 0000000..6fd2396
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui;
+
+import org.apache.nutch.webui.pages.DashboardPage;
+import org.apache.nutch.webui.pages.assets.NutchUiCssReference;
+import org.apache.wicket.markup.html.WebPage;
+import org.apache.wicket.protocol.http.WebApplication;
+import org.apache.wicket.spring.injection.annot.SpringComponentInjector;
+import org.springframework.beans.BeansException;
+import org.springframework.context.ApplicationContext;
+import org.springframework.context.ApplicationContextAware;
+import org.springframework.stereotype.Component;
+
+import de.agilecoders.wicket.core.Bootstrap;
+import de.agilecoders.wicket.core.markup.html.themes.bootstrap.BootstrapCssReference;
+import de.agilecoders.wicket.core.settings.BootstrapSettings;
+import de.agilecoders.wicket.core.settings.SingleThemeProvider;
+import de.agilecoders.wicket.core.settings.Theme;
+import de.agilecoders.wicket.extensions.markup.html.bootstrap.icon.FontAwesomeCssReference;
+
+@Component
+public class NutchUiApplication extends WebApplication implements
+    ApplicationContextAware {
+  private static final String THEME_NAME = "bootstrap";
+  private ApplicationContext context;
+
+  /**
+   * @see org.apache.wicket.Application#getHomePage()
+   */
+  @Override
+  public Class<? extends WebPage> getHomePage() {
+    return DashboardPage.class;
+  }
+
+  /**
+   * @see org.apache.wicket.Application#init()
+   */
+  @Override
+  public void init() {
+    super.init();
+    BootstrapSettings settings = new BootstrapSettings();
+    Bootstrap.install(this, settings);
+    configureTheme(settings);
+
+    getComponentInstantiationListeners().add(
+        new SpringComponentInjector(this, context));
+  }
+
+  private void configureTheme(BootstrapSettings settings) {
+    Theme theme = new Theme(THEME_NAME, BootstrapCssReference.instance(),
+        FontAwesomeCssReference.instance(), NutchUiCssReference.instance());
+    settings.setThemeProvider(new SingleThemeProvider(theme));
+  }
+
+  @Override
+  public void setApplicationContext(ApplicationContext applicationContext)
+      throws BeansException {
+    this.context = applicationContext;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
new file mode 100644
index 0000000..4c62939
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
@@ -0,0 +1,63 @@
+#############################################################################
+#Licensed to the Apache Software Foundation (ASF) under one or more
+#contributor license agreements.  See the NOTICE file distributed with
+#this work for additional information regarding copyright ownership.
+#The ASF licenses this file to You under the Apache License, Version 2.0
+#(the "License"); you may not use this file except in compliance with
+#the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#############################################################################
+
+navbar.menu.dashboard = Dashboard
+navbar.menu.statistics = Statistics
+navbar.menu.instances = Instances
+navbar.menu.settings = Settings
+navbar.menu.crawls = Crawls
+navbar.menu.scheduling = Scheduling
+navbar.menu.search = Search
+navbar.menu.url = URLs upload
+navbar.menu.seedLists = Seed lists
+
+page.header.seedList = Seed list
+
+navbar.userMenu.settings = Settings
+navbar.userMenu.logout = Log out
+
+menu.settings=Settings
+menu.instances=Instances
+
+connected=Connected
+disconnected=Disconnected
+
+##ENUMS
+ConnectionStatus.CONNECTING=Connecting
+ConnectionStatus.CONNECTED=Connected
+ConnectionStatus.DISCONNECTED=Disconnected
+
+CrawlStatus.NEW=New
+CrawlStatus.ERROR=Error
+CrawlStatus.CRAWLING=Crawling
+CrawlStatus.FINISHED=Finished
+
+instances=Instances
+instances.header.name=Instance name
+instances.header.hostname=Hostname
+instances.header.status=Status
+instances.header.username=Username
+instances.label.name=Instance name
+instances.label.hostname=Hostname
+instances.label.port=Port
+instances.label.username=Username
+instances.label.password=Password
+instances.buttons.addInstance=Add instance
+
+settings=Settings
+settings.header.name = Name
+settings.header.value = Value
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java
new file mode 100644
index 0000000..d534b8f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.wicket.protocol.http.WicketFilter;
+import org.apache.wicket.spring.SpringWebApplicationFactory;
+import org.mortbay.jetty.Handler;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.servlet.Context;
+import org.mortbay.jetty.servlet.DefaultServlet;
+import org.mortbay.jetty.servlet.FilterHolder;
+import org.springframework.web.context.ContextLoaderListener;
+import org.springframework.web.context.WebApplicationContext;
+import org.springframework.web.context.request.RequestContextListener;
+import org.springframework.web.context.support.AnnotationConfigWebApplicationContext;
+
+public class NutchUiServer {
+  private static final String APP_FACTORY_NAME = SpringWebApplicationFactory.class
+      .getName();
+  private static final String CONFIG_LOCATION = "org.apache.nutch.webui";
+  private static final String CMD_PORT = "port";
+  private static Integer port = 8080;
+
+  public static void main(String[] args) throws Exception {
+    CommandLineParser parser = new GnuParser();
+    Options options = createWebAppOptions();
+    CommandLine commandLine = null;
+    HelpFormatter formatter = new HelpFormatter();
+    try {
+      commandLine = parser.parse(options, args);
+    } catch (Exception e) {
+      formatter.printHelp("NutchUiServer", options, true);
+      StringUtils.stringifyException(e);
+    }
+
+    if (commandLine.hasOption("help")) {
+      formatter.printHelp("NutchUiServer", options, true);
+      return;
+    }
+    if (commandLine.hasOption(CMD_PORT)) {
+      port = Integer.parseInt(commandLine.getOptionValue(CMD_PORT));
+    }
+    startServer();
+  }
+
+  private static void startServer() throws Exception, InterruptedException {
+    Server server = new Server(port);
+    Context context = new Context(server, "/", Context.SESSIONS);
+    context.addServlet(DefaultServlet.class, "/*");
+
+    context.addEventListener(new ContextLoaderListener(getContext()));
+    context.addEventListener(new RequestContextListener());
+
+    WicketFilter filter = new WicketFilter();
+    filter.setFilterPath("/");
+    FilterHolder holder = new FilterHolder(filter);
+    holder.setInitParameter("applicationFactoryClassName", APP_FACTORY_NAME);
+    context.addFilter(holder, "/*", Handler.DEFAULT);
+
+    server.setHandler(context);
+    server.start();
+    server.join();
+  }
+
+  private static WebApplicationContext getContext() {
+    AnnotationConfigWebApplicationContext context = new AnnotationConfigWebApplicationContext();
+    context.setConfigLocation(CONFIG_LOCATION);
+    return context;
+  }
+
+  private static Options createWebAppOptions() {
+    Options options = new Options();
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    OptionBuilder.withDescription("Port to run the WebApplication on.");
+    OptionBuilder.hasOptionalArg();
+    OptionBuilder.withArgName("port number");
+    options.addOption(OptionBuilder.create(CMD_PORT));
+    options.addOption(helpOpt);
+    return options;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java
new file mode 100644
index 0000000..3f8887d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client;
+
+import java.util.Map;
+
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.JobConfig;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.apache.nutch.webui.client.model.NutchStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.model.SeedList;
+
/**
 * Client abstraction for talking to a single remote Nutch server instance.
 */
public interface NutchClient {

  /** Returns the instance descriptor this client is bound to. */
  public NutchInstance getNutchInstance();

  /** Fetches the current status report from the remote Nutch server. */
  public NutchStatus getNutchStatus();

  /** Returns the connection state of this client to the remote server. */
  public ConnectionStatus getConnectionStatus();

  /**
   * Submits a job for remote execution.
   *
   * @param jobConfig job type and arguments to run
   * @return identifier of the created job
   */
  public String executeJob(JobConfig jobConfig);

  /**
   * Looks up the state of a previously submitted job.
   *
   * @param jobId identifier returned by {@link #executeJob(JobConfig)}
   * @return current job information
   */
  public JobInfo getJobInfo(String jobId);

  /**
   * Retrieves a named configuration from the remote server.
   *
   * @param config name of the configuration to fetch
   * @return configuration properties as key/value pairs
   */
  public Map<String, String> getNutchConfig(String config);

  /**
   * Create seed list and return seed directory location
   * 
   * @param seedList
   * @return
   */
  public String createSeed(SeedList seedList);
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java
new file mode 100644
index 0000000..32da00e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client;
+
+import java.util.concurrent.ExecutionException;
+
+import org.apache.nutch.webui.client.impl.NutchClientImpl;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.springframework.stereotype.Component;
+
+import com.google.common.cache.CacheBuilder;
+import com.google.common.cache.CacheLoader;
+import com.google.common.cache.LoadingCache;
+
+@Component
+public class NutchClientFactory {
+  private LoadingCache<NutchInstance, NutchClient> cache;
+
+  public NutchClientFactory() {
+    cache = CacheBuilder.newBuilder().build(new NutchClientCacheLoader());
+  }
+
+  public NutchClient getClient(NutchInstance instance) {
+    try {
+      return cache.get(instance);
+    } catch (ExecutionException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  private static class NutchClientCacheLoader extends
+      CacheLoader<NutchInstance, NutchClient> {
+    @Override
+    public NutchClient load(NutchInstance key) throws Exception {
+      return new NutchClientImpl(key);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
new file mode 100644
index 0000000..2482c06
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import java.util.List;
+
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.apache.nutch.webui.client.model.JobInfo.State;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.Lists;
+
+/**
+ * This class implements crawl cycle as in crawl script
+ * 
+ * @author feodor
+ * 
+ */
+public class CrawlingCycle {
+  private Logger log = LoggerFactory.getLogger(CrawlingCycle.class);
+
+  private CrawlingCycleListener listener;
+  private RemoteCommandExecutor executor;
+  private Crawl crawl;
+
+  private List<RemoteCommand> remoteCommands;
+  private List<RemoteCommand> executedCommands = Lists.newArrayList();
+
+  public CrawlingCycle(CrawlingCycleListener listener,
+      RemoteCommandExecutor executor, Crawl crawl, List<RemoteCommand> commands) {
+    this.listener = listener;
+    this.executor = executor;
+    this.crawl = crawl;
+    this.remoteCommands = commands;
+  }
+
+  public synchronized void executeCrawlCycle() {
+    listener.crawlingStarted(crawl);
+
+    for (RemoteCommand command : remoteCommands) {
+      JobInfo jobInfo = executor.executeRemoteJob(command);
+      command.setJobInfo(jobInfo);
+
+      log.info("Executed remote command data: {}", command);
+
+      if (jobInfo.getState() == State.FAILED) {
+        listener.onCrawlError(crawl, jobInfo.getMsg());
+        return;
+      }
+
+      executedCommands.add(command);
+      listener.commandExecuted(crawl, command, calculateProgress());
+    }
+    listener.crawlingFinished(crawl);
+  }
+
+  private int calculateProgress() {
+    if (CollectionUtils.isEmpty(remoteCommands)) {
+      return 0;
+    }
+    return (int) ((float) executedCommands.size()
+        / (float) remoteCommands.size() * 100);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
new file mode 100644
index 0000000..c2abde5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import org.apache.nutch.webui.client.model.Crawl;
+
/**
 * Callback interface notified about the lifecycle of a crawl cycle run by
 * {@link CrawlingCycle}.
 */
public interface CrawlingCycleListener {

  /** Called once before the first command of the cycle is executed. */
  void crawlingStarted(Crawl crawl);

  /**
   * Called when a remote job reports the FAILED state; the cycle stops.
   *
   * @param msg error message reported by the failed job
   */
  void onCrawlError(Crawl crawl, String msg);

  /**
   * Called after each successfully executed command.
   *
   * @param progress percentage (0-100) of commands completed so far
   */
  void commandExecuted(Crawl crawl, RemoteCommand command, int progress);

  /** Called once after all commands completed successfully. */
  void crawlingFinished(Crawl crawl);

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
new file mode 100644
index 0000000..1a577f9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import static javax.ws.rs.core.MediaType.APPLICATION_JSON;
+
+import java.util.Map;
+
+import org.apache.nutch.webui.client.NutchClient;
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.JobConfig;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.apache.nutch.webui.client.model.NutchStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.model.SeedList;
+
+import com.sun.jersey.api.client.Client;
+import com.sun.jersey.api.client.WebResource;
+import com.sun.jersey.api.client.config.ClientConfig;
+import com.sun.jersey.api.client.config.DefaultClientConfig;
+import com.sun.jersey.api.json.JSONConfiguration;
+
+public class NutchClientImpl implements NutchClient {
+  private Client client;
+  private WebResource nutchResource;
+  private NutchInstance instance;
+
+  public NutchClientImpl(NutchInstance instance) {
+    this.instance = instance;
+    createClient();
+  }
+
+  public void createClient() {
+    ClientConfig clientConfig = new DefaultClientConfig();
+    clientConfig.getFeatures()
+        .put(JSONConfiguration.FEATURE_POJO_MAPPING, true);
+    this.client = Client.create(clientConfig);
+    this.nutchResource = client.resource(instance.getUrl());
+  }
+
+  @Override
+  public NutchStatus getNutchStatus() {
+    return nutchResource.path("/admin").type(APPLICATION_JSON)
+        .get(NutchStatus.class);
+  }
+
+  @Override
+  public ConnectionStatus getConnectionStatus() {
+
+    getNutchStatus();
+    return ConnectionStatus.CONNECTED;
+    // TODO implement disconnected status
+  }
+
+  @Override
+  public String executeJob(JobConfig jobConfig) {
+    JobInfo jobInfo = nutchResource.path("/job/create").type(APPLICATION_JSON)
+        .post(JobInfo.class, jobConfig);
+    return jobInfo.getId();
+  }
+
+  @Override
+  public JobInfo getJobInfo(String jobId) {
+    return nutchResource.path("/job/" + jobId).type(APPLICATION_JSON)
+        .get(JobInfo.class);
+  }
+
+  @Override
+  public NutchInstance getNutchInstance() {
+    return instance;
+  }
+
+  @SuppressWarnings("unchecked")
+  @Override
+  public Map<String, String> getNutchConfig(String config) {
+    return nutchResource.path("/config/" + config).type(APPLICATION_JSON)
+        .get(Map.class);
+  }
+
+  @Override
+  public String createSeed(SeedList seedList) {
+    return nutchResource.path("/seed/create").type(APPLICATION_JSON)
+        .post(String.class, seedList);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
new file mode 100644
index 0000000..ea19a8a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import java.io.Serializable;
+import java.text.MessageFormat;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.nutch.webui.client.model.JobConfig;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.joda.time.Duration;
+
+public class RemoteCommand implements Serializable {
+  private JobConfig jobConfig;
+  private JobInfo jobInfo = new JobInfo();
+  private Duration timeout;
+
+  /**
+   * Use {@link RemoteCommandBuilder} instead
+   */
+  @SuppressWarnings("unused")
+  private RemoteCommand() {
+  }
+
+  public RemoteCommand(JobConfig jobConfig) {
+    this.jobConfig = jobConfig;
+  }
+
+  public JobConfig getJobConfig() {
+    return jobConfig;
+  }
+
+  public void setJobConfig(JobConfig jobConfig) {
+    this.jobConfig = jobConfig;
+  }
+
+  public JobInfo getJobInfo() {
+    return jobInfo;
+  }
+
+  public void setJobInfo(JobInfo jobInfo) {
+    this.jobInfo = jobInfo;
+  }
+
+  public Duration getTimeout() {
+    return timeout;
+  }
+
+  public void setTimeout(Duration timeout) {
+    this.timeout = timeout;
+  }
+
+  @Override
+  public String toString() {
+    String statusInfo = StringUtils.EMPTY;
+    if (jobInfo != null) {
+      statusInfo = MessageFormat.format("{0}", jobInfo.getState());
+    }
+    return MessageFormat.format("{0} status: {1}", jobConfig.getType(),
+        statusInfo);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
new file mode 100644
index 0000000..d6b1767
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import org.apache.nutch.webui.client.model.JobConfig;
+import org.apache.nutch.webui.client.model.JobInfo.JobType;
+import org.joda.time.Duration;
+
+public class RemoteCommandBuilder {
+  private JobConfig jobConfig = new JobConfig();
+  private Duration timeout = Duration.standardSeconds(10);
+
+  private RemoteCommandBuilder() {
+  }
+
+  public static RemoteCommandBuilder instance(JobType jobType) {
+    return new RemoteCommandBuilder().withJobType(jobType);
+  }
+
+  public RemoteCommandBuilder withJobType(JobType jobType) {
+    jobConfig.setType(jobType);
+    return this;
+  }
+
+  public RemoteCommandBuilder withConfigId(String configId) {
+    jobConfig.setConfId(configId);
+    return this;
+  }
+
+  public RemoteCommandBuilder withCrawlId(String crawlId) {
+    jobConfig.setCrawlId(crawlId);
+    return this;
+  }
+
+  public RemoteCommandBuilder withArgument(String key, String value) {
+    jobConfig.setArgument(key, value);
+    return this;
+  }
+
+  public RemoteCommandBuilder withTimeout(Duration timeout) {
+    this.timeout = timeout;
+    return this;
+  }
+
+  public RemoteCommand build() {
+    RemoteCommand remoteCommand = new RemoteCommand(jobConfig);
+    remoteCommand.setTimeout(timeout);
+    return remoteCommand;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
new file mode 100644
index 0000000..e1eefc2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import static com.google.common.base.Preconditions.checkState;
+
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.nutch.webui.client.NutchClient;
+import org.apache.nutch.webui.client.model.JobInfo;
+import org.apache.nutch.webui.client.model.JobInfo.State;
+import org.joda.time.DateTimeConstants;
+import org.joda.time.Duration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class executes remote job and waits for success/failure result
+ * 
+ * @author feodor
+ * 
+ */
+public class RemoteCommandExecutor {
+  private Logger log = LoggerFactory.getLogger(RemoteCommandExecutor.class);
+
+  private static final int DEFAULT_TIMEOUT_SEC = 60;
+  private Duration requestDelay = new Duration(500);
+
+  private NutchClient client;
+  private ExecutorService executor;
+
+  public RemoteCommandExecutor(NutchClient client) {
+    this.client = client;
+    this.executor = Executors.newSingleThreadExecutor();
+  }
+
+  public JobInfo executeRemoteJob(RemoteCommand command) {
+    try {
+      String jobId = client.executeJob(command.getJobConfig());
+      Future<JobInfo> chekerFuture = executor
+          .submit(new JobStateChecker(jobId));
+      return chekerFuture.get(getTimeout(command), TimeUnit.MILLISECONDS);
+    } catch (Exception e) {
+      log.error("Remote command failed", e);
+      JobInfo jobInfo = new JobInfo();
+      jobInfo.setState(State.FAILED);
+      jobInfo.setMsg(ExceptionUtils.getStackTrace(e));
+      return jobInfo;
+    }
+  }
+
+  private long getTimeout(RemoteCommand command) {
+    if (command.getTimeout() == null) {
+      return DEFAULT_TIMEOUT_SEC * DateTimeConstants.MILLIS_PER_SECOND;
+    }
+    return command.getTimeout().getMillis();
+  }
+
+  public void setRequestDelay(Duration requestDelay) {
+    this.requestDelay = requestDelay;
+  }
+
+  public class JobStateChecker implements Callable<JobInfo> {
+
+    private String jobId;
+
+    public JobStateChecker(String jobId) {
+      this.jobId = jobId;
+    }
+
+    @Override
+    public JobInfo call() throws Exception {
+      while (!Thread.interrupted()) {
+        JobInfo jobInfo = client.getJobInfo(jobId);
+        checkState(jobInfo != null, "Cannot get job info!");
+
+        State state = jobInfo.getState();
+        checkState(state != null, "Unknown job state!");
+
+        if (state == State.RUNNING || state == State.ANY || state == State.IDLE) {
+          Thread.sleep(requestDelay.getMillis());
+          continue;
+        }
+
+        return jobInfo;
+      }
+      return null;
+    }
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
new file mode 100644
index 0000000..cef56a5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.impl;
+
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.client.model.JobInfo.JobType;
+import org.joda.time.Duration;
+import org.springframework.beans.factory.config.BeanDefinition;
+import org.springframework.context.annotation.Scope;
+import org.springframework.stereotype.Component;
+
+import com.google.common.collect.Lists;
+
+@Component
+@Scope(BeanDefinition.SCOPE_PROTOTYPE)
+public class RemoteCommandsBatchFactory {
+
+  private List<RemoteCommand> remoteCommands;
+  private Crawl crawl;
+
+  private String batchId;
+
+  public List<RemoteCommand> createCommands(Crawl crawl) {
+    this.crawl = crawl;
+    this.remoteCommands = Lists.newArrayList();
+
+    remoteCommands.add(inject());
+    for (int i = 0; i < crawl.getNumberOfRounds(); i++) {
+      remoteCommands.addAll(createBatchCommands());
+    }
+    return remoteCommands;
+  }
+
+  private List<RemoteCommand> createBatchCommands() {
+    this.batchId = UUID.randomUUID().toString();
+    List<RemoteCommand> batchCommands = Lists.newArrayList();
+
+    batchCommands.add(createGenerateCommand());
+    batchCommands.add(createFetchCommand());
+    batchCommands.add(createParseCommand());
+    batchCommands.add(createUpdateDbCommand());
+    batchCommands.add(createIndexCommand());
+
+    return batchCommands;
+  }
+
+  private RemoteCommand inject() {
+    RemoteCommandBuilder builder = RemoteCommandBuilder
+        .instance(JobType.INJECT).withCrawlId(crawl.getCrawlId())
+        .withArgument("url_dir", crawl.getSeedDirectory());
+    return builder.build();
+  }
+
+  private RemoteCommand createGenerateCommand() {
+    return createBuilder(JobType.GENERATE).build();
+  }
+
+  private RemoteCommand createFetchCommand() {
+    return createBuilder(JobType.FETCH).withTimeout(
+        Duration.standardSeconds(50)).build();
+  }
+
+  private RemoteCommand createParseCommand() {
+    return createBuilder(JobType.PARSE).build();
+  }
+
+  private RemoteCommand createIndexCommand() {
+    return createBuilder(JobType.INDEX).build();
+  }
+
+  private RemoteCommand createUpdateDbCommand() {
+    return createBuilder(JobType.UPDATEDB).build();
+  }
+
+  private RemoteCommandBuilder createBuilder(JobType jobType) {
+    return RemoteCommandBuilder.instance(jobType)
+        .withCrawlId(crawl.getCrawlId()).withArgument("batch", batchId);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
new file mode 100644
index 0000000..d834612
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
@@ -0,0 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.model;
+
/**
 * Connection state of a remote Nutch instance as seen by the web UI.
 */
public enum ConnectionStatus {
  /** Attempting to reach the instance. */
  CONNECTING,
  /** Instance reachable and responding. */
  CONNECTED,
  /** Instance could not be reached. */
  DISCONNECTED
}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java
new file mode 100644
index 0000000..6057f7f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.client.model;
+
+import java.io.Serializable;
+
+import javax.persistence.Column;
+import javax.persistence.Entity;
+import javax.persistence.GeneratedValue;
+import javax.persistence.Id;
+
+import org.apache.nutch.webui.model.SeedList;
+
+import com.j256.ormlite.field.DatabaseField;
+
+@Entity
+public class Crawl implements Serializable {
+  public enum CrawlStatus {
+    NEW, CRAWLING, FINISHED, ERROR
+  }
+
+  @Id
+  @GeneratedValue
+  private Long id;
+
+  @Column
+  private String crawlId;
+
+  @Column
+  private String crawlName;
+
+  @Column
+  private CrawlStatus status = CrawlStatus.NEW;
+
+  @Column
+  private Integer numberOfRounds = 1;
+
+  @Column
+  @DatabaseField(foreign = true, foreignAutoRefresh = true)
+  private SeedList seedList;
+
+  @Column
+  private String seedDirectory;
+
+  @Column
+  private int progress;
+
+  public Integer getNumberOfRounds() {
+    return numberOfRounds;
+  }
+
+  public void setNumberOfRounds(Integer numberOfRounds) {
+    this.numberOfRounds = numberOfRounds;
+  }
+
+  public String getCrawlId() {
+    return crawlId;
+  }
+
+  public void setCrawlId(String crawlId) {
+    this.crawlId = crawlId;
+  }
+
+  public CrawlStatus getStatus() {
+    return status;
+  }
+
+  public void setStatus(CrawlStatus status) {
+    this.status = status;
+  }
+
+  public String getCrawlName() {
+    return crawlName;
+  }
+
+  public void setCrawlName(String crawlName) {
+    this.crawlName = crawlName;
+  }
+
+  public SeedList getSeedList() {
+    return seedList;
+  }
+
+  public void setSeedList(SeedList seedList) {
+    this.seedList = seedList;
+  }
+
+  public Long getId() {
+    return id;
+  }
+
+  public void setId(Long id) {
+    this.id = id;
+  }
+
+  public String getSeedDirectory() {
+    return seedDirectory;
+  }
+
+  public void setSeedDirectory(String seedDirectory) {
+    this.seedDirectory = seedDirectory;
+  }
+
+  public int getProgress() {
+    return progress;
+  }
+
+  public void setProgress(int progress) {
+    this.progress = progress;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java
new file mode 100644
index 0000000..80df279
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java
@@ -0,0 +1,77 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.webui.client.model;
+
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.nutch.webui.client.model.JobInfo.JobType;
+
+import com.google.common.collect.Maps;
+
+public class JobConfig implements Serializable {
+  private String crawlId;
+  private JobType type;
+  private String confId = "default";
+  private String jobClassName;
+  private Map<String, Object> args = Maps.newHashMap();
+
+  public void setArgument(String key, String value) {
+    args.put(key, value);
+  }
+
+  public String getCrawlId() {
+    return crawlId;
+  }
+
+  public void setCrawlId(String crawlId) {
+    this.crawlId = crawlId;
+  }
+
+  public JobType getType() {
+    return type;
+  }
+
+  public void setType(JobType type) {
+    this.type = type;
+  }
+
+  public String getConfId() {
+    return confId;
+  }
+
+  public void setConfId(String confId) {
+    this.confId = confId;
+  }
+
+  public Map<String, Object> getArgs() {
+    return Collections.unmodifiableMap(args);
+  }
+
+  public void setArgs(Map<String, Object> args) {
+    this.args = args;
+  }
+
+  public String getJobClassName() {
+    return jobClassName;
+  }
+
+  public void setJobClassName(String jobClass) {
+    this.jobClassName = jobClass;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java
new file mode 100644
index 0000000..312118a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java
@@ -0,0 +1,104 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.webui.client.model;
+
+import java.io.Serializable;
+import java.util.Map;
+
/**
 * Status of a single job on a remote Nutch instance: identity, type,
 * configuration, arguments, current state and (once finished) its result.
 */
public class JobInfo implements Serializable {
  /** Kinds of jobs a Nutch instance can run. */
  public static enum JobType {
    INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS
  }

  /** Lifecycle states of a job; ANY acts as a wildcard in queries. */
  public static enum State {
    IDLE, RUNNING, FINISHED, FAILED, KILLED, STOPPING, KILLING, ANY
  }

  // Explicit version pins the serialized form across field changes.
  private static final long serialVersionUID = 1L;

  private String id;
  // Job type as a string; NOTE(review): a JobType enum exists above — confirm
  // whether the remote API requires the looser String representation here.
  private String type;
  private String confId;
  private Map<String, Object> args;
  private Map<String, Object> result;
  private State state;
  // Human-readable status/error message (e.g. a stack trace on failure).
  private String msg;
  private String crawlId;

  public String getMsg() {
    return msg;
  }

  public void setMsg(String msg) {
    this.msg = msg;
  }

  public State getState() {
    return state;
  }

  public void setState(State state) {
    this.state = state;
  }

  public Map<String, Object> getResult() {
    return result;
  }

  public void setResult(Map<String, Object> result) {
    this.result = result;
  }

  public Map<String, Object> getArgs() {
    return args;
  }

  public void setArgs(Map<String, Object> args) {
    this.args = args;
  }

  public String getConfId() {
    return confId;
  }

  public void setConfId(String confId) {
    this.confId = confId;
  }

  public String getId() {
    return id;
  }

  public void setId(String id) {
    this.id = id;
  }

  public String getCrawlId() {
    return crawlId;
  }

  public void setCrawlId(String crawlId) {
    this.crawlId = crawlId;
  }

  public String getType() {
    return type;
  }

  public void setType(String type) {
    this.type = type;
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java
new file mode 100644
index 0000000..0c5c425
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java
@@ -0,0 +1,62 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.webui.client.model;
+
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.Date;
+import java.util.Set;
+
+public class NutchStatus implements Serializable {
+
+  private Date startDate;
+  private Set<String> configuration;
+  private Collection<JobInfo> jobs;
+  private Collection<JobInfo> runningJobs;
+
+  public Date getStartDate() {
+    return startDate;
+  }
+
+  public void setStartDate(Date startDate) {
+    this.startDate = startDate;
+  }
+
+  public Set<String> getConfiguration() {
+    return configuration;
+  }
+
+  public void setConfiguration(Set<String> configuration) {
+    this.configuration = configuration;
+  }
+
+  public Collection<JobInfo> getJobs() {
+    return jobs;
+  }
+
+  public void setJobs(Collection<JobInfo> jobs) {
+    this.jobs = jobs;
+  }
+
+  public Collection<JobInfo> getRunningJobs() {
+    return runningJobs;
+  }
+
+  public void setRunningJobs(Collection<JobInfo> runningJobs) {
+    this.runningJobs = runningJobs;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java
new file mode 100644
index 0000000..09c2d6a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.config;
+
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import com.j256.ormlite.dao.Dao;
+import com.j256.ormlite.spring.DaoFactory;
+import com.j256.ormlite.support.ConnectionSource;
+
+public class CustomDaoFactory {
+  private ConnectionSource connectionSource;
+  private List<Dao<?, ?>> registredDaos = Collections
+      .synchronizedList(new ArrayList<Dao<?, ?>>());
+
+  public CustomDaoFactory(ConnectionSource connectionSource) {
+    this.connectionSource = connectionSource;
+  }
+
+  public <T, ID> Dao<T, ID> createDao(Class<T> clazz) {
+    try {
+      Dao<T, ID> dao = DaoFactory.createDao(connectionSource, clazz);
+      register(dao);
+      return dao;
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private <T, ID> void register(Dao<T, ID> dao) {
+    synchronized (registredDaos) {
+      registredDaos.add(dao);
+    }
+  }
+
+  public List<Dao<?, ?>> getCreatedDaos() {
+    synchronized (registredDaos) {
+      return Collections.unmodifiableList(registredDaos);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java
new file mode 100644
index 0000000..9b31d73
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.config;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import com.j256.ormlite.dao.BaseDaoImpl;
+import com.j256.ormlite.dao.Dao;
+import com.j256.ormlite.support.ConnectionSource;
+import com.j256.ormlite.table.DatabaseTableConfig;
+import com.j256.ormlite.table.TableUtils;
+
+public class CustomTableCreator {
+
+  private ConnectionSource connectionSource;
+  private List<Dao<?, ?>> configuredDaos;
+
+  public CustomTableCreator(ConnectionSource connectionSource,
+      List<Dao<?, ?>> configuredDaos) {
+    this.connectionSource = connectionSource;
+    this.configuredDaos = configuredDaos;
+    initialize();
+  }
+
+  private void initialize() {
+    if (configuredDaos == null) {
+      throw new IllegalStateException("configuredDaos was not set in "
+          + getClass().getSimpleName());
+    }
+
+    for (Dao<?, ?> dao : configuredDaos) {
+      createTableForDao(dao);
+    }
+  }
+
+  private void createTableForDao(Dao<?, ?> dao) {
+    DatabaseTableConfig<?> tableConfig = getTableConfig(dao);
+    createTableIfNotExists(tableConfig);
+  }
+
+  private DatabaseTableConfig<?> getTableConfig(Dao<?, ?> dao) {
+    Class<?> clazz = dao.getDataClass();
+    DatabaseTableConfig<?> tableConfig = null;
+    if (dao instanceof BaseDaoImpl) {
+      tableConfig = ((BaseDaoImpl<?, ?>) dao).getTableConfig();
+    }
+    if (tableConfig == null) {
+      return getConfigFromClass(clazz);
+    }
+    return tableConfig;
+  }
+
+  private DatabaseTableConfig<?> getConfigFromClass(Class<?> clazz) {
+    try {
+      return DatabaseTableConfig.fromClass(connectionSource, clazz);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private void createTableIfNotExists(DatabaseTableConfig<?> tableConfig) {
+    try {
+      TableUtils.createTableIfNotExists(connectionSource, tableConfig);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
new file mode 100644
index 0000000..8b76440
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.config;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.NutchInstance;
+
/**
 * Top-level configuration of the Nutch web GUI: the list of Nutch server
 * instances the GUI can connect to and manage.
 */
public class NutchGuiConfiguration {
  // Stored as given; callers share the same mutable list instance.
  private List<NutchInstance> instances;

  /** @return the configured Nutch instances, or {@code null} if never set */
  public List<NutchInstance> getInstances() {
    return instances;
  }

  /** @param instances the Nutch instances the GUI should manage */
  public void setInstances(List<NutchInstance> instances) {
    this.instances = instances;
  }
}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java
new file mode 100644
index 0000000..1687cee
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.config;
+
+import java.sql.SQLException;
+import java.util.concurrent.Executor;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.model.SeedUrl;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.scheduling.annotation.AsyncConfigurer;
+import org.springframework.scheduling.annotation.EnableAsync;
+import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;
+
+import com.j256.ormlite.dao.Dao;
+import com.j256.ormlite.db.H2DatabaseType;
+import com.j256.ormlite.jdbc.JdbcConnectionSource;
+
+@Configuration
+@EnableAsync
+public class SpringConfiguration implements AsyncConfigurer {
+
+  @Override
+  public Executor getAsyncExecutor() {
+    // TODO move magic numbers to properties file
+    ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
+    executor.setCorePoolSize(7);
+    executor.setMaxPoolSize(42);
+    executor.setQueueCapacity(11);
+    executor.setThreadNamePrefix("SpringExecutor-");
+    executor.initialize();
+    return executor;
+  }
+
+  @Bean
+  public JdbcConnectionSource getConnectionSource() throws SQLException {
+    JdbcConnectionSource source = new JdbcConnectionSource(
+        "jdbc:h2:~/.nutch/config", new H2DatabaseType());
+    source.initialize();
+    return source;
+  }
+
+  @Bean
+  public CustomDaoFactory getDaoFactory() throws SQLException {
+    return new CustomDaoFactory(getConnectionSource());
+  }
+
+  @Bean
+  public Dao<NutchInstance, Long> createNutchDao() throws SQLException {
+    return getDaoFactory().createDao(NutchInstance.class);
+  }
+
+  @Bean
+  public Dao<SeedList, Long> createSeedListDao() throws SQLException {
+    return getDaoFactory().createDao(SeedList.class);
+  }
+
+  @Bean
+  public Dao<SeedUrl, Long> createSeedUrlDao() throws SQLException {
+    return getDaoFactory().createDao(SeedUrl.class);
+  }
+
+  @Bean
+  public Dao<Crawl, Long> createCrawlDao() throws SQLException {
+    return getDaoFactory().createDao(Crawl.class);
+  }
+
+  @Bean
+  public CustomTableCreator createTableCreator() throws SQLException {
+    return new CustomTableCreator(getConnectionSource(), getDaoFactory()
+        .getCreatedDaos());
+  }
+
+}


[15/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
new file mode 100644
index 0000000..86692ae
--- /dev/null
+++ b/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.openqa.selenium.By;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
+import org.openqa.selenium.TimeoutException;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.firefox.FirefoxBinary;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.io.TemporaryFilesystem;
+import org.openqa.selenium.remote.DesiredCapabilities;
+import org.openqa.selenium.remote.RemoteWebDriver;
+import org.openqa.selenium.safari.SafariDriver;
+import org.openqa.selenium.phantomjs.PhantomJSDriver;
+import org.openqa.selenium.phantomjs.PhantomJSDriverService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.opera.core.systems.OperaDriver;
+
+// Helper for fetching and rendering pages through a Selenium WebDriver.
+// Supports local drivers (Firefox, Chrome, Safari, Opera, PhantomJS) and a
+// remote Selenium Grid hub, selected via the "selenium.driver" property.
+public class HttpWebClient {
+
+  private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class);
+
+  // Per-thread Firefox driver with stylesheets (2 = block), images (2 = block)
+  // and the Flash plugin disabled, restricted to "localhost".
+  // NOTE(review): not referenced by any method in this class — confirm
+  // whether external callers use it or it is dead code.
+  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
+
+    @Override
+    protected WebDriver initialValue()
+    {
+      FirefoxProfile profile = new FirefoxProfile();
+      profile.setPreference("permissions.default.stylesheet", 2);
+      profile.setPreference("permissions.default.image", 2);
+      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
+      profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost");
+      WebDriver driver = new FirefoxDriver(profile);
+      return driver;          
+    };
+  };
+
+  /**
+   * Builds a WebDriver according to the "selenium.driver" property
+   * (firefox/chrome/safari/opera/phantomjs/remote, defaulting to firefox),
+   * navigates it to the given URL and returns it. The page-load timeout is
+   * taken from "page.load.delay" (default 3 seconds).
+   * NOTE(review): on a page-load TimeoutException the partially loaded
+   * driver is returned so the caller can capture whatever rendered; any
+   * other exception cleans up the driver and is rethrown as RuntimeException.
+   * NOTE(review): a null conf NPEs at conf.getLong below — see the
+   * single-argument getHtmlPage(String), which passes null.
+   */
+  public static WebDriver getDriverForPage(String url, Configuration conf) {
+      WebDriver driver = null;
+      DesiredCapabilities capabilities = null;
+      long pageLoadWait = conf.getLong("page.load.delay", 3);
+
+      try {
+        String driverType  = conf.get("selenium.driver", "firefox");
+        switch (driverType) {
+          case "firefox":
+          	String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost");
+          	long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45);
+          	boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false);
+          	int loadImage = conf.getInt("selenium.firefox.load.image", 1);
+          	int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1);
+    		    FirefoxProfile profile = new FirefoxProfile();
+    		    FirefoxBinary binary = new FirefoxBinary();
+    		    profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
+    		    profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
+    		    profile.setPreference("permissions.default.stylesheet", loadStylesheet);
+  	      	profile.setPreference("permissions.default.image", loadImage);
+    		    binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
+            driver = new FirefoxDriver(binary, profile);
+            break;
+          case "chrome":
+            driver = new ChromeDriver();
+            break;
+          case "safari":
+            driver = new SafariDriver();
+            break;
+          case "opera":
+            driver = new OperaDriver();
+            break;
+          case "phantomjs":
+            driver = new PhantomJSDriver();
+            break;
+          case "remote":
+            // Selenium Grid: hub location/protocol plus the grid-side browser
+            // and its binary path come from the selenium.hub.* / selenium.grid.*
+            // properties.
+            String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
+            int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
+            String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
+            String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
+            String seleniumGridDriver = conf.get("selenium.grid.driver","firefox");
+            String seleniumGridBinary = conf.get("selenium.grid.binary");
+
+            switch (seleniumGridDriver){
+              case "firefox":
+                capabilities = DesiredCapabilities.firefox();
+                capabilities.setBrowserName("firefox");
+                capabilities.setJavascriptEnabled(true);
+                capabilities.setCapability("firefox_binary",seleniumGridBinary);
+                System.setProperty("webdriver.reap_profile", "false");
+                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
+                break;
+              case "phantomjs":
+                capabilities = DesiredCapabilities.phantomjs();
+                capabilities.setBrowserName("phantomjs");
+                capabilities.setJavascriptEnabled(true);
+                capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,seleniumGridBinary);
+                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
+                break;
+              default:
+                // NOTE(review): message interpolates driverType, not
+                // seleniumGridDriver — the log names the wrong setting.
+                LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
+                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox());
+                break;
+            }
+            break;
+          default:
+            LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
+            driver = new FirefoxDriver();
+            break;
+        }
+        LOG.debug("Selenium {} WebDriver selected.", driverType);
+  
+        driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
+        driver.get(url);
+      } catch (Exception e) {
+			  if(e instanceof TimeoutException) {
+          LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+          return driver;
+			  }
+			  cleanUpDriver(driver);
+		    throw new RuntimeException(e);
+	    } 
+
+      return driver;
+  }
+
+  // Returns the rendered <body> inner HTML of the page the driver currently
+  // holds, optionally persisting a screenshot first ("take.screenshot").
+  // Does NOT clean up the driver — that is the caller's responsibility.
+  public static String getHTMLContent(WebDriver driver, Configuration conf) {
+      if (conf.getBoolean("take.screenshot", false)) {
+        takeScreenshot(driver, conf);
+      }
+
+      return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+  }
+
+  // Closes the current window, quits the driver and purges Selenium's
+  // temporary files. NOTE(review): calling close() immediately before
+  // quit() is redundant — quit() closes every window; with some drivers
+  // close() on the last window can already end the session.
+  public static void cleanUpDriver(WebDriver driver) {
+    if (driver != null) {
+      try {
+	      driver.close();
+        driver.quit();
+        TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  /**
+   * Function for obtaining the HTML BODY using the selected
+   * {@link org.openqa.selenium.WebDriver}.
+   * There are a number of configuration properties within
+   * <code>nutch-site.xml</code> which determine whether to
+   * take screenshots of the rendered pages and persist them
+   * as timestamped .png's into HDFS.
+   * @param url the URL to fetch and render
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the rendered inner HTML page
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = getDriverForPage(url, conf);
+    
+    try {
+      if (conf.getBoolean("take.screenshot", false)) {
+        takeScreenshot(driver, conf);
+      }
+
+      String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+      return innerHtml;
+
+      // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit
+    } catch (Exception e) {
+      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      throw new RuntimeException(e);
+    } finally {
+      cleanUpDriver(driver);
+    }
+  }
+
+  /**
+   * Convenience overload.
+   * NOTE(review): passes a null Configuration into getDriverForPage(),
+   * which dereferences it unconditionally (conf.getLong) — this overload
+   * always throws NullPointerException as written.
+   */
+  public static String getHtmlPage(String url) {
+    return getHtmlPage(url, null);
+  }
+
+  /**
+   * Takes a screenshot of the current page and persists it under the
+   * "screenshot.location" directory on the configured Hadoop FileSystem;
+   * if that property is absent the screenshot is discarded with a warning.
+   * NOTE(review): if the target path already exists, 'os' stays null and
+   * IOUtils.copyBytes below will NPE; also "disgarded" in the warning is a
+   * typo for "discarded" (left untouched here — it is a runtime string).
+   */
+  private static void takeScreenshot(WebDriver driver, Configuration conf) {
+    try {
+      String url = driver.getCurrentUrl();
+      File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+      LOG.debug("In-memory screenshot taken of: {}", url);
+      FileSystem fs = FileSystem.get(conf);
+      if (conf.get("screenshot.location") != null) {
+        Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName());
+        OutputStream os = null;
+        if (!fs.exists(screenshotPath)) {
+          LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
+          os = fs.create(screenshotPath);
+        }
+        InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
+        IOUtils.copyBytes(is, os, conf);
+        LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); 
+      } else {
+        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
+            + "'screenshot.location' is absent from nutch-site.xml.", url);
+      }
+    } catch (Exception e) {
+      cleanUpDriver(driver);
+      throw new RuntimeException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-xml/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/build.xml b/nutch-plugins/lib-xml/build.xml
new file mode 100644
index 0000000..0f87c07
--- /dev/null
+++ b/nutch-plugins/lib-xml/build.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-xml" default="jar">
+
+	<import file="../build-plugin.xml" />
+
+	<!--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! -->
+	<target name="compile" depends="init, resolve-default" />
+
+	<!--
+	<target name="jar" depends="compile">
+		<copy todir="${build.dir}" verbose="true">
+			<fileset dir="./lib" includes="**/*.jar" />
+		</copy>
+	</target>
+	-->
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-xml/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/ivy.xml b/nutch-plugins/lib-xml/ivy.xml
new file mode 100644
index 0000000..414f38a
--- /dev/null
+++ b/nutch-plugins/lib-xml/ivy.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default"/>
+    <dependency org="jaxen" name="jaxen" rev="1.1.1" conf="*->master"/>
+    <dependency org="xerces" name="xercesImpl" rev="2.11.0" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-xml/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/plugin.xml b/nutch-plugins/lib-xml/plugin.xml
new file mode 100644
index 0000000..79bd17f
--- /dev/null
+++ b/nutch-plugins/lib-xml/plugin.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! XML library - Gathers many XML related libraries:
+ !
+ ! * Jaxen
+ !     - Download : http://jaxen.org/releases.html
+ !     - License  : http://jaxen.org/license.html
+ !
+ !   * Xerces-J 2.11.0 (version per ivy.xml)
+ !     - Download : http://xerces.apache.org/xerces2-j/download.cgi
+ !     - License  : http://www.apache.org/licenses/LICENSE-2.0
+ !
+ !   * SAXPath 1.0 FCS
+ !     - Note     : SAXPath has been incorporated into Jaxen.
+ !                  It has been merged into the Jaxen codebase
+ !                  and is no longer being maintained separately
+ !     - Download : http://sourceforge.net/project/showfiles.php?group_id=26014
+ !     - License  : OSI-Approved Open Source
+ !
+ !   * jdom 1.1 (version per ivy.xml)
+ !     - Download : http://www.jdom.org/downloads/index.html
+ !     - License  : http://www.jdom.org/docs/faq.html#a0030
+ !
+ !-->
+<plugin
+   id="lib-xml"
+   name="XML Libraries"
+   version="1.0"
+   provider-name="org.apache.nutch.xml">
+
+   <runtime>
+     <library name="jaxen-core.jar">
+       <export name="*"/>
+     </library>
+     <library name="jaxen-jdom.jar">
+       <export name="*"/>
+     </library>
+     <library name="xercesImpl.jar">
+       <export name="*"/>
+     </library>
+     <library name="saxpath.jar">
+       <export name="*"/>
+     </library>
+     <library name="jdom.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-xml/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/pom.xml b/nutch-plugins/lib-xml/pom.xml
new file mode 100644
index 0000000..132d0f2
--- /dev/null
+++ b/nutch-plugins/lib-xml/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-xml</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-xml</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/build.xml b/nutch-plugins/microformats-reltag/build.xml
new file mode 100644
index 0000000..395afee
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="microformats-reltag" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/ivy.xml b/nutch-plugins/microformats-reltag/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/plugin.xml b/nutch-plugins/microformats-reltag/plugin.xml
new file mode 100644
index 0000000..b35e1f4
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/plugin.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="microformats-reltag"
+   name="Rel-Tag microformat Parser/Indexer/Querier"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+    <runtime>
+      <library name="microformats-reltag.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.microformats.reltag.RelTagParser"
+              name="Rel-Tag parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="RelTagParser"
+                      class="org.apache.nutch.microformats.reltag.RelTagParser"/>
+   </extension>
+
+   <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"
+              name="Rel-Tag indexing filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="RelTagIndexingFilter"
+                      class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/pom.xml b/nutch-plugins/microformats-reltag/pom.xml
new file mode 100644
index 0000000..8579cb5
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>microformats-reltag</artifactId>
+    <packaging>jar</packaging>
+
+    <name>microformats-reltag</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
new file mode 100644
index 0000000..e50a150
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.parse.Parse;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that add <code>tag</code>
+ * field(s) to the document.
+ * 
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ *      http://www.microformats.org/wiki/rel-tag</a>
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class RelTagIndexingFilter implements IndexingFilter {
+
+  // Injected Hadoop configuration (see setConf/getConf below).
+  private Configuration conf;
+
+  /**
+   * Copies any "Rel-Tag" values that {@link RelTagParser} stored in the
+   * parse metadata into the document's "tag" field, one entry per tag.
+   * Returns the document unchanged when no Rel-Tags were found.
+   */
+  // Inherited JavaDoc
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // Check if some Rel-Tags found, possibly put there by RelTagParser
+    String[] tags = parse.getData().getParseMeta()
+        .getValues(RelTagParser.REL_TAG);
+    if (tags != null) {
+      for (int i = 0; i < tags.length; i++) {
+        doc.add("tag", tags[i]);
+      }
+    }
+
+    return doc;
+  }
+
+  /*
+   * ----------------------------- * <implementation:Configurable> *
+   * -----------------------------
+   */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /*
+   * ------------------------------ * </implementation:Configurable> *
+   * ------------------------------
+   */
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
new file mode 100644
index 0000000..9176a1e
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// JDK imports
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeSet;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.StringUtil;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Adds microformat rel-tags of document if found.
+ * 
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ *      http://www.microformats.org/wiki/rel-tag</a>
+ */
+public class RelTagParser implements HtmlParseFilter {
+
+  public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class);
+
+  public final static String REL_TAG = "Rel-Tag";
+
+  private Configuration conf = null;
+
+  /**
+   * Scan the HTML document looking at possible rel-tags
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // get parse obj
+    Parse parse = parseResult.get(content.getUrl());
+    // Trying to find the document's rel-tags
+    Parser parser = new Parser(doc);
+    Set<?> tags = parser.getRelTags();
+    Iterator<?> iter = tags.iterator();
+    Metadata metadata = parse.getData().getParseMeta();
+    while (iter.hasNext())
+      metadata.add(REL_TAG, (String) iter.next());
+
+    return parseResult;
+  }
+
+  private static class Parser {
+
+    Set<String> tags = null;
+
+    Parser(Node node) {
+      tags = new TreeSet<String>();
+      parse(node);
+    }
+
+    Set<String> getRelTags() {
+      return tags;
+    }
+
+    void parse(Node node) {
+
+      if (node.getNodeType() == Node.ELEMENT_NODE) {
+        // Look for <a> tag
+        if ("a".equalsIgnoreCase(node.getNodeName())) {
+          NamedNodeMap attrs = node.getAttributes();
+          Node hrefNode = attrs.getNamedItem("href");
+          // Checks that it contains a href attribute
+          if (hrefNode != null) {
+            Node relNode = attrs.getNamedItem("rel");
+            // Checks that it contains a rel attribute too
+            if (relNode != null) {
+              // Finaly checks that rel=tag
+              if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
+                String tag = parseTag(hrefNode.getNodeValue());
+                if (!StringUtil.isEmpty(tag)) {
+                  if (!tags.contains(tag)) {
+                    tags.add(tag);
+                    LOG.debug("Adding tag: " + tag + " to tag set.");
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+
+      // Recurse
+      NodeList children = node.getChildNodes();
+      for (int i = 0; children != null && i < children.getLength(); i++)
+        parse(children.item(i));
+    }
+
+    private final static String parseTag(String url) {
+      String tag = null;
+      try {
+        URL u = new URL(url);
+        String path = u.getPath();
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
+            "UTF-8");
+      } catch (Exception e) {
+        // Malformed tag...
+        tag = null;
+      }
+      return tag;
+    }
+
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
new file mode 100644
index 0000000..bef5409
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
@@ -0,0 +1,8 @@
+<html>
+<body>
+<p>
+A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a>
+Parser/Indexer/Querier plugin.
+</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/build.xml b/nutch-plugins/mimetype-filter/build.xml
new file mode 100644
index 0000000..977e643
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="mimetype-filter" default="jar-core">
+
+    <import file="../build-plugin.xml" />
+
+    <!-- for junit test -->
+    <mkdir dir="${build.test}/data"/>
+    <copy todir="${build.test}/data">
+        <fileset dir="sample" includes="**/*.txt"/>
+    </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/ivy.xml b/nutch-plugins/mimetype-filter/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/plugin.xml b/nutch-plugins/mimetype-filter/plugin.xml
new file mode 100644
index 0000000..d038447
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/plugin.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="mimetype-filter"
+   name="Filter indexed documents by the detected MIME"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="mimetype-filter.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+    <extension id="org.apache.nutch.indexer.filter"
+               name="Nutch MIME filter"
+               point="org.apache.nutch.indexer.IndexingFilter">
+        <implementation id="MimeTypeIndexingFilter"
+                        class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/>
+    </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/pom.xml b/nutch-plugins/mimetype-filter/pom.xml
new file mode 100644
index 0000000..29c0798
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>mimetype-filter</artifactId>
+    <packaging>jar</packaging>
+
+    <name>mimetype-filter</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
new file mode 100644
index 0000000..494d888
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.UnrecognizedOptionException;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+import org.apache.tika.Tika;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
+ * of documents based on the MIME Type detected by Tika
+ *
+ */
+public class MimeTypeIndexingFilter implements IndexingFilter {
+
+  public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MimeTypeIndexingFilter.class);
+
+  private MimeUtil MIME;
+  private Tika tika = new Tika();
+
+  private TrieStringMatcher trie;
+
+  private Configuration conf;
+
+  private boolean acceptMode = true;
+
+  // Inherited JavaDoc
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    String mimeType;
+    String contentType;
+
+    Writable tcontentType = datum.getMetaData()
+        .get(new Text(Response.CONTENT_TYPE));
+
+    if (tcontentType != null) {
+      contentType = tcontentType.toString();
+    } else {
+      contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
+    }
+
+    if (contentType == null) {
+      mimeType = tika.detect(url.toString());
+    } else {
+      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
+    }
+
+    contentType = mimeType;
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info(String.format("[%s] %s", contentType, url));
+    }
+
+    if (trie != null) {
+      if (trie.shortestMatch(contentType) == null) {
+        // no match, but
+        if (acceptMode) {
+          return doc;
+        }
+        return null;
+      } else {
+        // matched, but we are blocking
+        if (acceptMode) {
+          return null;
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  /*
+   * -----------------------------
+   * <implementation:Configurable> *
+   * -----------------------------
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    MIME = new MimeUtil(conf);
+
+    // load the file of the values
+    String file = conf.get(MIMEFILTER_REGEX_FILE, "");
+
+    if (file != null) {
+      if (file.isEmpty()) {
+        LOG.warn(String
+            .format("Missing %s property, ALL mimetypes will be allowed",
+                MIMEFILTER_REGEX_FILE));
+      } else {
+        Reader reader = conf.getConfResourceAsReader(file);
+
+        try {
+          readConfiguration(reader);
+        } catch (IOException e) {
+          if (LOG.isErrorEnabled()) {
+            LOG.error(e.getMessage());
+          }
+
+          throw new RuntimeException(e.getMessage(), e);
+        }
+      }
+    }
+  }
+
+  private void readConfiguration(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    String line;
+    List rules = new ArrayList();
+
+    while (null != (line = in.readLine())) {
+      if (line.length() == 0) {
+        continue;
+      }
+
+      char first = line.charAt(0);
+      switch (first) {
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
+        break;
+      case '+':
+        acceptMode = true;
+        break;
+      case '-':
+        acceptMode = false;
+        break;
+      default:
+        rules.add(line);
+        break;
+      }
+    }
+
+    trie = new PrefixStringMatcher(rules);
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Main method for invoking this tool
+   *
+   * @throws IOException, IndexingException
+   */
+  public static void main(String[] args) throws IOException, IndexingException {
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
+        .withDescription(
+            "Rules file to be used in the tests relative to the conf directory")
+        .isRequired().create("rules");
+
+    Options options = new Options();
+    options.addOption(helpOpt).addOption(rulesOpt);
+
+    CommandLineParser parser = new GnuParser();
+    HelpFormatter formatter = new HelpFormatter();
+    String rulesFile;
+
+    try {
+      CommandLine line = parser.parse(options, args);
+
+      if (line.hasOption("help") || !line.hasOption("rules")) {
+        formatter
+            .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
+                options, true);
+        return;
+      }
+
+      rulesFile = line.getOptionValue("rules");
+    } catch (UnrecognizedOptionException e) {
+      formatter
+          .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
+              options, true);
+      return;
+    } catch (Exception e) {
+      LOG.error(StringUtils.stringifyException(e));
+      e.printStackTrace();
+      return;
+    }
+
+    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+    Configuration conf = NutchConfiguration.create();
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
+    filter.setConf(conf);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+
+    while ((line = in.readLine()) != null && !line.isEmpty()) {
+      Metadata metadata = new Metadata();
+      metadata.set(Response.CONTENT_TYPE, line);
+      ParseImpl parse = new ParseImpl("text",
+          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+
+      NutchDocument doc = filter.filter(new NutchDocument(), parse,
+          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+      if (doc != null) {
+        System.out.print("+ ");
+        System.out.println(line);
+      } else {
+        System.out.print("- ");
+        System.out.println(line);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/src/test/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/test/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java b/nutch-plugins/mimetype-filter/src/test/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
new file mode 100644
index 0000000..bca230f
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/test/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter}
+ *
+ */
+public class MimeTypeIndexingFilterTest {
+
+  private Configuration conf = NutchConfiguration.create();
+  private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+  private String[] MIME_TYPES = { "text/html", "image/png", "application/pdf" };
+  private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length];
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  @Before
+  public void setUp() throws Exception {
+    for (int i = 0; i < MIME_TYPES.length; i++) {
+      Metadata metadata = new Metadata();
+      metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]);
+
+      ParseImpl parse = new ParseImpl("text",
+          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+
+      parses[i] = parse;
+    }
+  }
+
+  @Test
+  public void testMissingConfigFile() throws Exception {
+    String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
+    Assert.assertEquals(String
+        .format("Property %s must not be present in the the configuration file",
+            MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
+
+    filter.setConf(conf);
+
+    // property not set so in this cases all documents must pass the filter
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
+          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+      Assert.assertNotNull("All documents must be allowed by default", doc);
+    }
+  }
+
+  @Test
+  public void testAllowOnlyImages() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
+          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+      if (MIME_TYPES[i].contains("image")) {
+        Assert.assertNotNull("Allow only images", doc);
+      } else {
+        Assert.assertNull("Block everything else", doc);
+      }
+    }
+  }
+
+  @Test
+  public void testBlockHTML() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
+          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+      if (MIME_TYPES[i].contains("html")) {
+        Assert.assertNull("Block only HTML documents", doc);
+      } else {
+        Assert.assertNotNull("Allow everything else", doc);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt b/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt
new file mode 100644
index 0000000..0f5f136
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype per line, with no preceding
+# whitespace. Order, in which suffixes are specified, doesn't matter. Blank
+# lines and comments (#) are allowed.
+#
+
+-
+
+image

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt b/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt
new file mode 100644
index 0000000..69600ec
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype per line, with no preceding
+# whitespace. Order, in which suffixes are specified, doesn't matter. Blank
+# lines and comments (#) are allowed.
+#
+
++
+
+text/html
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/nutch-extensionpoints/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/build.xml b/nutch-plugins/nutch-extensionpoints/build.xml
new file mode 100644
index 0000000..45eb815
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/build.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="nutch-extensionpoints" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+  <!--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! -->
+  <target name="compile" depends="init, resolve-default"/>
+
+  <!--target name="jar" depends="compile"/-->
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/nutch-extensionpoints/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/ivy.xml b/nutch-plugins/nutch-extensionpoints/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/nutch-extensionpoints/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/plugin.xml b/nutch-plugins/nutch-extensionpoints/plugin.xml
new file mode 100644
index 0000000..8cf7a23
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/plugin.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="nutch-extensionpoints"
+   name="the nutch core extension points"
+   version="2.0.0"
+   provider-name="nutch.org">
+
+   <!-- this file hosts all extension points nutch core code offers. 
+   Please not that plugins can define extension points as well to be extendable.-->
+
+<extension-point
+      id="org.apache.nutch.indexer.IndexingFilter"
+      name="Nutch Indexing Filter"/>
+
+<extension-point
+      id="org.apache.nutch.indexer.IndexWriter"
+      name="Nutch Index Writer"/>
+
+<extension-point
+      id="org.apache.nutch.parse.Parser"
+      name="Nutch Content Parser"/>
+ 
+<extension-point
+      id="org.apache.nutch.parse.HtmlParseFilter"
+      name="HTML Parse Filter"/>
+
+<extension-point
+      id="org.apache.nutch.protocol.Protocol"
+      name="Nutch Protocol"/>
+
+<extension-point
+      id="org.apache.nutch.net.URLFilter"
+      name="Nutch URL Filter"/>
+
+<extension-point
+        id="org.apache.nutch.net.URLExemptionFilter"
+        name="Nutch URL Ignore Exemption Filter"/>
+
+<extension-point
+      id="org.apache.nutch.net.URLNormalizer"
+      name="Nutch URL Normalizer"/>
+
+<extension-point
+      id="org.apache.nutch.scoring.ScoringFilter"
+      name="Nutch Scoring"/>
+
+<extension-point
+      id="org.apache.nutch.segment.SegmentMergeFilter"
+      name="Nutch Segment Merge Filter"/>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/nutch-extensionpoints/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/pom.xml b/nutch-plugins/nutch-extensionpoints/pom.xml
new file mode 100644
index 0000000..db76178
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>nutch-extensionpoints</artifactId>
+    <packaging>jar</packaging>
+
+    <name>nutch-extensionpoints</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/build.xml b/nutch-plugins/parse-ext/build.xml
new file mode 100644
index 0000000..25552fa
--- /dev/null
+++ b/nutch-plugins/parse-ext/build.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-ext" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+  </target>
+
+
+  <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/>
+  <chmod file="${deploy.dir}/command" perm="755"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/command
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/command b/nutch-plugins/parse-ext/command
new file mode 100644
index 0000000..f42c055
--- /dev/null
+++ b/nutch-plugins/parse-ext/command
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Sample bash script as external command invoked by parse-ext plugin
+#
+# 20040701, John Xing
+
+set -e
+
+if  [ $# -ne 1 ]; then
+  echo Usage:$0 mimeType >&2
+  exit 1
+fi
+
+case $1 in
+"application/vnd.nutch.example.cat")
+  cat
+  ;;
+"application/vnd.nutch.example.md5sum")
+  md5sum
+  ;;
+*)
+  echo "Can't parse mimeType $1" >&2
+  exit 1
+esac

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/ivy.xml b/nutch-plugins/parse-ext/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-ext/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/plugin.xml b/nutch-plugins/parse-ext/plugin.xml
new file mode 100644
index 0000000..6819b36
--- /dev/null
+++ b/nutch-plugins/parse-ext/plugin.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-ext"
+   name="External Parser Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-ext.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.ext"
+              name="ExtParse"
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="ExtParser"
+                      class="org.apache.nutch.parse.ext.ExtParser">
+        <parameter name="contentType" value="application/vnd.nutch.example.cat"/>
+        <parameter name="pathSuffix"  value=""/>
+        <parameter name="command"     value="./build/plugins/parse-ext/command"/>
+        <parameter name="timeout"     value="10"/>
+        <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
+        <!-- <parameter name="encoding" value="UTF-8"/> -->
+      </implementation>
+
+      <implementation id="ExtParser"
+                      class="org.apache.nutch.parse.ext.ExtParser">
+        <parameter name="contentType" value="application/vnd.nutch.example.md5sum"/>
+        <parameter name="pathSuffix"  value=""/>
+        <parameter name="command"     value="./build/plugins/parse-ext/command"/>
+        <parameter name="timeout"     value="20"/>
+        <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
+        <!-- <parameter name="encoding" value="UTF-8"/> -->
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/pom.xml b/nutch-plugins/parse-ext/pom.xml
new file mode 100644
index 0000000..5a7b7be
--- /dev/null
+++ b/nutch-plugins/parse-ext/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-ext</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-ext</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
new file mode 100644
index 0000000..94d9b32
--- /dev/null
+++ b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
@@ -0,0 +1,183 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.ext;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+
+import org.apache.nutch.util.CommandRunner;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Hashtable;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.Charset;
+
+/**
+ * A wrapper that invokes external command to do real parsing job.
+ * 
+ * @author John Xing
+ */
+
+public class ExtParser implements Parser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.ext");
+
+  static final int BUFFER_SIZE = 4096;
+
+  static final int TIMEOUT_DEFAULT = 30; // in seconds
+
+  // handy map from String contentType to String[] {command, timeoutString,
+  // encoding}
+  Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>();
+
+  private Configuration conf;
+
+  public ExtParser() {
+  }
+
+  public ParseResult getParse(Content content) {
+
+    String contentType = content.getContentType();
+
+    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
+    if (params == null)
+      return new ParseStatus(ParseStatus.FAILED,
+          "No external command defined for contentType: " + contentType)
+          .getEmptyParseResult(content.getUrl(), getConf());
+
+    String command = params[0];
+    int timeout = Integer.parseInt(params[1]);
+    String encoding = params[2];
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
+    }
+
+    String text = null;
+    String title = null;
+
+    try {
+
+      byte[] raw = content.getContent();
+
+      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
+      if (contentLength != null
+          && raw.length != Integer.parseInt(contentLength)) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+                + " bytes. Parser can't handle incomplete " + contentType
+                + " file.").getEmptyParseResult(content.getUrl(), getConf());
+      }
+
+      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
+      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);
+
+      CommandRunner cr = new CommandRunner();
+
+      cr.setCommand(command + " " + contentType);
+      cr.setInputStream(new ByteArrayInputStream(raw));
+      cr.setStdOutputStream(os);
+      cr.setStdErrorStream(es);
+
+      cr.setTimeout(timeout);
+
+      cr.evaluate();
+
+      if (cr.getExitValue() != 0)
+        return new ParseStatus(ParseStatus.FAILED, "External command "
+            + command + " failed with error: " + es.toString())
+            .getEmptyParseResult(content.getUrl(), getConf());
+
+      text = os.toString(encoding);
+
+    } catch (Exception e) { // run time exception
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    if (text == null)
+      text = "";
+
+    if (title == null)
+      title = "";
+
+    // collect outlink
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, content.getMetadata());
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+        parseData));
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
+
+    String contentType, command, timeoutString, encoding;
+
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+
+      // only look for extensions defined by plugin parse-ext
+      if (!extension.getDescriptor().getPluginId().equals("parse-ext"))
+        continue;
+
+      contentType = extension.getAttribute("contentType");
+      if (contentType == null || contentType.equals(""))
+        continue;
+
+      command = extension.getAttribute("command");
+      if (command == null || command.equals(""))
+        continue;
+
+      // null encoding means default
+      encoding = extension.getAttribute("encoding");
+      if (encoding == null)
+        encoding = Charset.defaultCharset().name();
+
+      timeoutString = extension.getAttribute("timeout");
+      if (timeoutString == null || timeoutString.equals(""))
+        timeoutString = "" + TIMEOUT_DEFAULT;
+
+      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString,
+          encoding });
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
new file mode 100644
index 0000000..6394489
--- /dev/null
+++ b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse wrapper to run external command to do the parsing.
+ */
+package org.apache.nutch.parse.ext;
+


[41/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/ExtensionPoint.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/ExtensionPoint.java b/nutch-core/src/main/java/org/apache/nutch/plugin/ExtensionPoint.java
new file mode 100644
index 0000000..178c5a2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/ExtensionPoint.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.util.ArrayList;
+
+/**
+ * The <code>ExtensionPoint</code> provide meta information of a extension
+ * point.
+ * 
+ * @author joa23
+ */
+public class ExtensionPoint {
+  private String ftId;
+  private String fName;
+  private String fSchema;
+  private ArrayList<Extension> fExtensions;
+
+  /**
+   * Constructor
+   * 
+   * @param pId
+   *          unique extension point Id
+   * @param pName
+   *          name of the extension point
+   * @param pSchema
+   *          xml schema of the extension point
+   */
+  public ExtensionPoint(String pId, String pName, String pSchema) {
+    setId(pId);
+    setName(pName);
+    setSchema(pSchema);
+    fExtensions = new ArrayList<Extension>();
+  }
+
+  /**
+   * Returns the unique id of the extension point.
+   * 
+   * @return String
+   */
+  public String getId() {
+    return ftId;
+  }
+
+  /**
+   * Returns the name of the extension point.
+   * 
+   * @return String
+   */
+  public String getName() {
+    return fName;
+  }
+
+  /**
+   * Returns a path to the xml schema of a extension point.
+   * 
+   * @return String
+   */
+  public String getSchema() {
+    return fSchema;
+  }
+
+  /**
+   * Sets the extensionPointId.
+   * 
+   * @param pId
+   *          extension point id
+   */
+  private void setId(String pId) {
+    ftId = pId;
+  }
+
+  /**
+   * Sets the extension point name.
+   * 
+   * @param pName
+   */
+  private void setName(String pName) {
+    fName = pName;
+  }
+
+  /**
+   * Sets the schema.
+   * 
+   * @param pSchema
+   */
+  private void setSchema(String pSchema) {
+    fSchema = pSchema;
+  }
+
+  /**
+   * Install a coresponding extension to this extension point.
+   * 
+   * @param extension
+   */
+  public void addExtension(Extension extension) {
+    fExtensions.add(extension);
+  }
+
+  /**
+   * Returns a array of extensions that lsiten to this extension point
+   * 
+   * @return Extension[]
+   */
+  public Extension[] getExtensions() {
+    return fExtensions.toArray(new Extension[fExtensions.size()]);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/MissingDependencyException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/MissingDependencyException.java b/nutch-core/src/main/java/org/apache/nutch/plugin/MissingDependencyException.java
new file mode 100644
index 0000000..b81cc50
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/MissingDependencyException.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+/**
+ * <code>MissingDependencyException</code> will be thrown if a plugin dependency
+ * cannot be found.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class MissingDependencyException extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  public MissingDependencyException(Throwable cause) {
+    super(cause);
+  }
+
+  public MissingDependencyException(String message) {
+    super(message);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/Pluggable.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/Pluggable.java b/nutch-core/src/main/java/org/apache/nutch/plugin/Pluggable.java
new file mode 100644
index 0000000..09aba30
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/Pluggable.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+/**
+ * Defines the capability of a class to be plugged into Nutch. This is a common
+ * interface that must be implemented by all Nutch Extension Points.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ * 
+ * @see <a href="http://wiki.apache.org/nutch/AboutPlugins">About Plugins</a>
+ * @see <a href="package-summary.html#package_description"> plugin package
+ *      description</a>
+ */
+public interface Pluggable {
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/Plugin.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/Plugin.java b/nutch-core/src/main/java/org/apache/nutch/plugin/Plugin.java
new file mode 100644
index 0000000..e78754b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/Plugin.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * A nutch-plugin is a container for a set of custom logic that provides
+ * extensions to the nutch core functionality or another plugin that provides an
+ * API for extending. A plugin can provide one or a set of extensions.
+ * Extensions are components that can be dynamically installed as a kind of
+ * listener to extension points. Extension points are a kind of publisher that
+ * provide an API and invoke one or a set of installed extensions.
+ * 
+ * Each plugin may extend the base <code>Plugin</code>. <code>Plugin</code>
+ * instances are used as the point of life cycle management of plugin related
+ * functionality.
+ * 
+ * The <code>Plugin</code> will be started up and shut down by the nutch plugin
+ * management system.
+ * 
+ * A possible usecase of the <code>Plugin</code> implementation is to create or
+ * close a database connection.
+ * 
+ * @author joa23
+ */
+public class Plugin {
+  private PluginDescriptor fDescriptor;
+  protected Configuration conf;
+
+  /**
+   * Constructor
+   * 
+   */
+  public Plugin(PluginDescriptor pDescriptor, Configuration conf) {
+    setDescriptor(pDescriptor);
+    this.conf = conf;
+  }
+
+  /**
+   * Will be invoked upon plugin start up. Since the nutch-plugin system uses
+   * lazy loading, the start up is invoked the first time an extension is
+   * used.
+   * 
+   * @throws PluginRuntimeException
+   *           If the startup was not successful.
+   */
+  public void startUp() throws PluginRuntimeException {
+  }
+
+  /**
+   * Shutdown the plugin. This happens when nutch is stopped.
+   * 
+   * @throws PluginRuntimeException
+   *           if a problem occurs while shutting down the plugin.
+   */
+  public void shutDown() throws PluginRuntimeException {
+  }
+
+  /**
+   * Returns the plugin descriptor
+   * 
+   * @return PluginDescriptor
+   */
+  public PluginDescriptor getDescriptor() {
+    return fDescriptor;
+  }
+
+  /**
+   * @param descriptor
+   *          The descriptor to set
+   */
+  private void setDescriptor(PluginDescriptor descriptor) {
+    fDescriptor = descriptor;
+  }
+
+  protected void finalize() throws Throwable {
+    super.finalize();
+    shutDown();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java
new file mode 100644
index 0000000..128bbc6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginClassLoader.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.net.URL;
+import java.net.URLClassLoader;
+import java.util.Arrays;
+
+/**
+ * The <code>PluginClassLoader</code> contains only classes of the runtime
+ * libraries set up in the plugin manifest file and exported libraries of
+ * required plugins. Libraries can be exported or not. Not
+ * exported libraries are only used in the plugin's own
+ * <code>PluginClassLoader</code>. Exported libraries are available for
+ * the <code>PluginClassLoader</code> of plugins that depend on these plugins.
+ * 
+ * @author joa23
+ */
+public class PluginClassLoader extends URLClassLoader {
+
+  private URL[] urls;
+  private ClassLoader parent;
+
+  /**
+   * Constructor
+   * 
+   * @param urls
+   *          Array of urls with own libraries and all exported libraries of
+   *          plugins that are required to this plugin
+   * @param parent
+   */
+  public PluginClassLoader(URL[] urls, ClassLoader parent) {
+    super(urls, parent);
+
+    this.urls = urls;
+    this.parent = parent;
+  }
+
+  @Override
+  public int hashCode() {
+    final int PRIME = 31;
+    int result = 1;
+    result = PRIME * result + ((parent == null) ? 0 : parent.hashCode());
+    result = PRIME * result + Arrays.hashCode(urls);
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    final PluginClassLoader other = (PluginClassLoader) obj;
+    if (parent == null) {
+      if (other.parent != null)
+        return false;
+    } else if (!parent.equals(other.parent))
+      return false;
+    if (!Arrays.equals(urls, other.urls))
+      return false;
+    return true;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginDescriptor.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginDescriptor.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginDescriptor.java
new file mode 100644
index 0000000..0a43745
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginDescriptor.java
@@ -0,0 +1,363 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.MissingResourceException;
+import java.util.ResourceBundle;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * The <code>PluginDescriptor</code> provides access to all meta information of a
+ * nutch-plugin, as well as to the internationalizable resources and the plugin's
+ * own classloader. There is meta information about <code>Plugin</code>,
+ * <code>ExtensionPoint</code> and <code>Extension</code>. Providing access to
+ * the meta data of a plugin via a descriptor allows a lazy loading mechanism.
+ */
+public class PluginDescriptor {
+  private String fPluginPath;
+  private String fPluginClass = Plugin.class.getName();
+  private String fPluginId;
+  private String fVersion;
+  private String fName;
+  private String fProviderName;
+  private HashMap<String, ResourceBundle> fMessages = new HashMap<String, ResourceBundle>();
+  private ArrayList<ExtensionPoint> fExtensionPoints = new ArrayList<ExtensionPoint>();
+  private ArrayList<String> fDependencies = new ArrayList<String>();
+  private ArrayList<URL> fExportedLibs = new ArrayList<URL>();
+  private ArrayList<URL> fNotExportedLibs = new ArrayList<URL>();
+  private ArrayList<Extension> fExtensions = new ArrayList<Extension>();
+  private PluginClassLoader fClassLoader;
+  public static final Logger LOG = LoggerFactory
+      .getLogger(PluginDescriptor.class);
+  private Configuration fConf;
+
+  /**
+   * Constructor
+   * 
+   * @param pId
+   * @param pVersion
+   * @param pName
+   * @param pProviderName
+   * @param pPluginclazz
+   * @param pPath
+   */
+  public PluginDescriptor(String pId, String pVersion, String pName,
+      String pProviderName, String pPluginclazz, String pPath,
+      Configuration conf) {
+    setPath(pPath);
+    setPluginId(pId);
+    setVersion(pVersion);
+    setName(pName);
+    setProvidername(pProviderName);
+
+    if (pPluginclazz != null)
+      setPluginClass(pPluginclazz);
+
+    this.fConf = conf;
+  }
+
+  /**
+   * @param pPath
+   */
+  private void setPath(String pPath) {
+    fPluginPath = pPath;
+  }
+
+  /**
+   * Returns the name of the plugin.
+   * 
+   * @return String
+   */
+  public String getName() {
+    return fName;
+  }
+
+  /**
+   * @param providerName
+   */
+  private void setProvidername(String providerName) {
+    fProviderName = providerName;
+  }
+
+  /**
+   * @param name
+   */
+  private void setName(String name) {
+    fName = name;
+  }
+
+  /**
+   * @param version
+   */
+  private void setVersion(String version) {
+    fVersion = version;
+  }
+
+  /**
+   * Returns the fully qualified name of the class which implements the abstract
+   * <code>Plugin</code> class.
+   * 
+   * @return the name of this plug-in's runtime class or <code>null</code>.
+   */
+  public String getPluginClass() {
+    return fPluginClass;
+  }
+
+  /**
+   * Returns the unique identifier of the plug-in or <code>null</code>.
+   * 
+   * @return String
+   */
+  public String getPluginId() {
+    return fPluginId;
+  }
+
+  /**
+   * Returns an array of extensions.
+   * 
+   * @return Extension[]
+   */
+  public Extension[] getExtensions() {
+    return fExtensions.toArray(new Extension[fExtensions.size()]);
+  }
+
+  /**
+   * Adds an extension.
+   * 
+   * @param pExtension
+   */
+  public void addExtension(Extension pExtension) {
+    fExtensions.add(pExtension);
+  }
+
+  /**
+   * Sets the pluginClass.
+   * 
+   * @param pluginClass
+   *          The pluginClass to set
+   */
+  private void setPluginClass(String pluginClass) {
+    fPluginClass = pluginClass;
+  }
+
+  /**
+   * Sets the plugin Id.
+   * 
+   * @param pluginId
+   *          The pluginId to set
+   */
+  private void setPluginId(String pluginId) {
+    fPluginId = pluginId;
+  }
+
+  /**
+   * Adds an extension point.
+   * 
+   * @param extensionPoint
+   */
+  public void addExtensionPoint(ExtensionPoint extensionPoint) {
+    fExtensionPoints.add(extensionPoint);
+  }
+
+  /**
+   * Returns an array of extension points.
+   * 
+   * @return ExtensionPoint[]
+   */
+  public ExtensionPoint[] getExtenstionPoints() {
+    return fExtensionPoints
+        .toArray(new ExtensionPoint[fExtensionPoints.size()]);
+  }
+
+  /**
+   * Returns an array of plugin ids.
+   * 
+   * @return String[]
+   */
+  public String[] getDependencies() {
+    return fDependencies.toArray(new String[fDependencies.size()]);
+  }
+
+  /**
+   * Adds a dependency
+   * 
+   * @param pId
+   *          id of the dependent plugin
+   */
+  public void addDependency(String pId) {
+    fDependencies.add(pId);
+  }
+
+  /**
+   * Adds an exported library with a relative path to the plugin directory. We
+   * automatically escape characters that are illegal in URLs. It is recommended
+   * that code converts an abstract pathname into a URL by first converting it
+   * into a URI, via the toURI method, and then converting the URI into a URL
+   * via the URI.toURL method.
+   * 
+   * @param pLibPath
+   */
+  public void addExportedLibRelative(String pLibPath)
+      throws MalformedURLException {
+    URI uri = new File(getPluginPath() + File.separator + pLibPath).toURI();
+    URL url = uri.toURL();
+    fExportedLibs.add(url);
+  }
+
+  /**
+   * Returns the directory path of the plugin.
+   * 
+   * @return String
+   */
+  public String getPluginPath() {
+    return fPluginPath;
+  }
+
+  /**
+   * Returns an array of exported libraries as URLs
+   * 
+   * @return URL[]
+   */
+  public URL[] getExportedLibUrls() {
+    return fExportedLibs.toArray(new URL[0]);
+  }
+
+  /**
+   * Adds a not exported library with a relative path to the plugin directory. We
+   * automatically escape characters that are illegal in URLs. It is recommended
+   * that code converts an abstract pathname into a URL by first converting it
+   * into a URI, via the toURI method, and then converting the URI into a URL
+   * via the URI.toURL method.
+   * 
+   * @param pLibPath
+   */
+  public void addNotExportedLibRelative(String pLibPath)
+      throws MalformedURLException {
+    URI uri = new File(getPluginPath() + File.separator + pLibPath).toURI();
+    URL url = uri.toURL();
+    fNotExportedLibs.add(url);
+  }
+
+  /**
+   * Returns an array of libraries as URLs that are not exported by the plugin.
+   * 
+   * @return URL[]
+   */
+  public URL[] getNotExportedLibUrls() {
+    return fNotExportedLibs.toArray(new URL[fNotExportedLibs.size()]);
+  }
+
+  /**
+   * Returns a cached classloader for a plugin. Before classloader creation all
+   * needed libraries are collected. A classloader uses first the plugin's own
+   * libraries and then adds all exported libraries of dependent plugins.
+   * 
+   * @return PluginClassLoader the classloader for the plugin
+   */
+  public PluginClassLoader getClassLoader() {
+    if (fClassLoader != null)
+      return fClassLoader;
+    ArrayList<URL> arrayList = new ArrayList<URL>();
+    arrayList.addAll(fExportedLibs);
+    arrayList.addAll(fNotExportedLibs);
+    arrayList.addAll(getDependencyLibs());
+    File file = new File(getPluginPath());
+    try {
+      for (File file2 : file.listFiles()) {
+        if (file2.getAbsolutePath().endsWith("properties"))
+          arrayList.add(file2.getParentFile().toURI().toURL());
+      }
+    } catch (MalformedURLException e) {
+      LOG.debug(getPluginId() + " " + e.toString());
+    }
+    URL[] urls = arrayList.toArray(new URL[arrayList.size()]);
+    fClassLoader = new PluginClassLoader(urls,
+        PluginDescriptor.class.getClassLoader());
+    return fClassLoader;
+  }
+
+  /**
+   * @return Collection
+   */
+  private ArrayList<URL> getDependencyLibs() {
+    ArrayList<URL> list = new ArrayList<URL>();
+    collectLibs(list, this);
+    return list;
+  }
+
+  /**
+   * @param pLibs
+   * @param pDescriptor
+   */
+  private void collectLibs(ArrayList<URL> pLibs, PluginDescriptor pDescriptor) {
+
+    for (String id : pDescriptor.getDependencies()) {
+      PluginDescriptor descriptor = PluginRepository.get(fConf)
+          .getPluginDescriptor(id);
+      for (URL url : descriptor.getExportedLibUrls()) {
+        pLibs.add(url);
+      }
+      collectLibs(pLibs, descriptor);
+    }
+  }
+
+  /**
+   * Returns an I18N'd resource string. The resource bundles can be stored in the
+   * root directory of a plugin following the well-known i18n file name conventions.
+   * 
+   * @param pKey
+   * @param pLocale
+   * @return String
+   * @throws IOException
+   */
+  public String getResourceString(String pKey, Locale pLocale)
+      throws IOException {
+    if (fMessages.containsKey(pLocale.toString())) {
+      ResourceBundle bundle = fMessages.get(pLocale.toString());
+      try {
+        return bundle.getString(pKey);
+      } catch (MissingResourceException e) {
+        return '!' + pKey + '!';
+      }
+    }
+    try {
+      ResourceBundle res = ResourceBundle.getBundle("messages", pLocale,
+          getClassLoader());
+      return res.getString(pKey);
+    } catch (MissingResourceException x) {
+      return '!' + pKey + '!';
+    }
+  }
+
+  public String getProviderName() {
+    return fProviderName;
+  }
+
+  public String getVersion() {
+    return fVersion;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginManifestParser.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginManifestParser.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginManifestParser.java
new file mode 100644
index 0000000..bd2a490
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginManifestParser.java
@@ -0,0 +1,303 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.slf4j.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+/**
+ * The <code>PluginManifestParser</code> parses the manifest files in
+ * all plugin directories.
+ * 
+ * @author joa23
+ */
+public class PluginManifestParser {
+  private static final String ATTR_NAME = "name";
+  private static final String ATTR_CLASS = "class";
+  private static final String ATTR_ID = "id";
+
+  public static final Logger LOG = PluginRepository.LOG;
+
+  private static final boolean WINDOWS = System.getProperty("os.name")
+      .startsWith("Windows");
+
+  private Configuration conf;
+
+  private PluginRepository pluginRepository;
+
+  public PluginManifestParser(Configuration conf,
+      PluginRepository pluginRepository) {
+    this.conf = conf;
+    this.pluginRepository = pluginRepository;
+  }
+
+  /**
+   * Returns a list of all found plugin descriptors.
+   * 
+   * @param pluginFolders
+   *          folders to search plugins from
+   * @return A {@link Map} of all found {@link PluginDescriptor}s.
+   */
+  public Map<String, PluginDescriptor> parsePluginFolder(String[] pluginFolders) {
+    Map<String, PluginDescriptor> map = new HashMap<String, PluginDescriptor>();
+
+    if (pluginFolders == null) {
+      throw new IllegalArgumentException("plugin.folders is not defined");
+    }
+
+    for (String name : pluginFolders) {
+      File directory = getPluginFolder(name);
+      if (directory == null) {
+        continue;
+      }
+      LOG.info("Plugins: looking in: " + directory.getAbsolutePath());
+      for (File oneSubFolder : directory.listFiles()) {
+        if (oneSubFolder.isDirectory()) {
+          String manifestPath = oneSubFolder.getAbsolutePath() + File.separator
+              + "plugin.xml";
+          try {
+            LOG.debug("parsing: " + manifestPath);
+            PluginDescriptor p = parseManifestFile(manifestPath);
+            map.put(p.getPluginId(), p);
+          } catch (Exception e) {
+            LOG.warn("Error while loading plugin `" + manifestPath + "` "
+                + e.toString());
+          }
+        }
+      }
+    }
+    return map;
+  }
+
+  /**
+   * Return the named plugin folder. If the name is absolute then it is
+   * returned. Otherwise, for relative names, the classpath is scanned.
+   */
+  public File getPluginFolder(String name) {
+    File directory = new File(name);
+    if (!directory.isAbsolute()) {
+      URL url = PluginManifestParser.class.getClassLoader().getResource(name);
+      if (url == null && directory.exists() && directory.isDirectory()
+          && directory.listFiles().length > 0) {
+        return directory; // relative path that is not in the classpath
+      } else if (url == null) {
+        LOG.warn("Plugins: directory not found: " + name);
+        return null;
+      } else if (!"file".equals(url.getProtocol())) {
+        LOG.warn("Plugins: not a file: url. Can't load plugins from: " + url);
+        return null;
+      }
+      String path = url.getPath();
+      if (WINDOWS && path.startsWith("/")) // patch a windows bug
+        path = path.substring(1);
+      try {
+        path = URLDecoder.decode(path, "UTF-8"); // decode the url path
+      } catch (UnsupportedEncodingException e) {
+      }
+      directory = new File(path);
+    } else if (!directory.exists()) {
+      LOG.warn("Plugins: directory not found: " + name);
+      return null;
+    }
+    return directory;
+  }
+
+  /**
+   * @param manifestPath
+   * @throws ParserConfigurationException
+   * @throws IOException
+   * @throws SAXException
+   * @throws MalformedURLException
+   */
+  private PluginDescriptor parseManifestFile(String pManifestPath)
+      throws MalformedURLException, SAXException, IOException,
+      ParserConfigurationException {
+    Document document = parseXML(new File(pManifestPath).toURI().toURL());
+    String pPath = new File(pManifestPath).getParent();
+    return parsePlugin(document, pPath);
+  }
+
+  /**
+   * @param url
+   * @return Document
+   * @throws IOException
+   * @throws SAXException
+   * @throws ParserConfigurationException
+   * @throws DocumentException
+   */
+  private Document parseXML(URL url) throws SAXException, IOException,
+      ParserConfigurationException {
+    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+    DocumentBuilder builder = factory.newDocumentBuilder();
+    return builder.parse(url.openStream());
+  }
+
+  /**
+   * @param pDocument
+   * @throws MalformedURLException
+   */
+  private PluginDescriptor parsePlugin(Document pDocument, String pPath)
+      throws MalformedURLException {
+    Element rootElement = pDocument.getDocumentElement();
+    String id = rootElement.getAttribute(ATTR_ID);
+    String name = rootElement.getAttribute(ATTR_NAME);
+    String version = rootElement.getAttribute("version");
+    String providerName = rootElement.getAttribute("provider-name");
+    String pluginClazz = null;
+    if (rootElement.getAttribute(ATTR_CLASS).trim().length() > 0) {
+      pluginClazz = rootElement.getAttribute(ATTR_CLASS);
+    }
+    PluginDescriptor pluginDescriptor = new PluginDescriptor(id, version, name,
+        providerName, pluginClazz, pPath, this.conf);
+    LOG.debug("plugin: id=" + id + " name=" + name + " version=" + version
+        + " provider=" + providerName + "class=" + pluginClazz);
+    parseExtension(rootElement, pluginDescriptor);
+    parseExtensionPoints(rootElement, pluginDescriptor);
+    parseLibraries(rootElement, pluginDescriptor);
+    parseRequires(rootElement, pluginDescriptor);
+    return pluginDescriptor;
+  }
+
+  /**
+   * @param pRootElement
+   * @param pDescriptor
+   * @throws MalformedURLException
+   */
+  private void parseRequires(Element pRootElement, PluginDescriptor pDescriptor)
+      throws MalformedURLException {
+
+    NodeList nodelist = pRootElement.getElementsByTagName("requires");
+    if (nodelist.getLength() > 0) {
+
+      Element requires = (Element) nodelist.item(0);
+
+      NodeList imports = requires.getElementsByTagName("import");
+      for (int i = 0; i < imports.getLength(); i++) {
+        Element anImport = (Element) imports.item(i);
+        String plugin = anImport.getAttribute("plugin");
+        if (plugin != null) {
+          pDescriptor.addDependency(plugin);
+        }
+      }
+    }
+  }
+
+  /**
+   * @param pRootElement
+   * @param pDescriptor
+   * @throws MalformedURLException
+   */
+  private void parseLibraries(Element pRootElement, PluginDescriptor pDescriptor)
+      throws MalformedURLException {
+    NodeList nodelist = pRootElement.getElementsByTagName("runtime");
+    if (nodelist.getLength() > 0) {
+
+      Element runtime = (Element) nodelist.item(0);
+
+      NodeList libraries = runtime.getElementsByTagName("library");
+      for (int i = 0; i < libraries.getLength(); i++) {
+        Element library = (Element) libraries.item(i);
+        String libName = library.getAttribute(ATTR_NAME);
+        NodeList list = library.getElementsByTagName("export");
+        Element exportElement = (Element) list.item(0);
+        if (exportElement != null)
+          pDescriptor.addExportedLibRelative(libName);
+        else
+          pDescriptor.addNotExportedLibRelative(libName);
+      }
+    }
+  }
+
+  /**
+   * @param rootElement
+   * @param pluginDescriptor
+   */
+  private void parseExtensionPoints(Element pRootElement,
+      PluginDescriptor pPluginDescriptor) {
+    NodeList list = pRootElement.getElementsByTagName("extension-point");
+    if (list != null) {
+      for (int i = 0; i < list.getLength(); i++) {
+        Element oneExtensionPoint = (Element) list.item(i);
+        String id = oneExtensionPoint.getAttribute(ATTR_ID);
+        String name = oneExtensionPoint.getAttribute(ATTR_NAME);
+        String schema = oneExtensionPoint.getAttribute("schema");
+        ExtensionPoint extensionPoint = new ExtensionPoint(id, name, schema);
+        pPluginDescriptor.addExtensionPoint(extensionPoint);
+      }
+    }
+  }
+
+  /**
+   * @param rootElement
+   * @param pluginDescriptor
+   */
+  private void parseExtension(Element pRootElement,
+      PluginDescriptor pPluginDescriptor) {
+    NodeList extensions = pRootElement.getElementsByTagName("extension");
+    if (extensions != null) {
+      for (int i = 0; i < extensions.getLength(); i++) {
+        Element oneExtension = (Element) extensions.item(i);
+        String pointId = oneExtension.getAttribute("point");
+
+        NodeList extensionImplementations = oneExtension.getChildNodes();
+        if (extensionImplementations != null) {
+          for (int j = 0; j < extensionImplementations.getLength(); j++) {
+            Node node = extensionImplementations.item(j);
+            if (!node.getNodeName().equals("implementation")) {
+              continue;
+            }
+            Element oneImplementation = (Element) node;
+            String id = oneImplementation.getAttribute(ATTR_ID);
+            String extensionClass = oneImplementation.getAttribute(ATTR_CLASS);
+            LOG.debug("impl: point=" + pointId + " class=" + extensionClass);
+            Extension extension = new Extension(pPluginDescriptor, pointId, id,
+                extensionClass, this.conf, this.pluginRepository);
+            NodeList parameters = oneImplementation
+                .getElementsByTagName("parameter");
+            if (parameters != null) {
+              for (int k = 0; k < parameters.getLength(); k++) {
+                Element param = (Element) parameters.item(k);
+                extension.addAttribute(param.getAttribute(ATTR_NAME),
+                    param.getAttribute("value"));
+              }
+            }
+            pPluginDescriptor.addExtension(extension);
+          }
+        }
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRepository.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRepository.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRepository.java
new file mode 100644
index 0000000..3e19345
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRepository.java
@@ -0,0 +1,523 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.lang.reflect.Array;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.WeakHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.ObjectCache;
+
+/**
+ * The plugin repository is a registry of all plugins.
+ * 
+ * At system boot-up a repository is built by parsing the manifest files of
+ * all plugins. Plugins whose declared dependencies are not present are not
+ * registered. For each plugin a plugin descriptor instance will be created. The
+ * descriptor represents all meta information about a plugin. A plugin
+ * instance is only created later, when it is first required; this allows lazy
+ * plugin loading.
+ */
+public class PluginRepository {
+  private static final WeakHashMap<String, PluginRepository> CACHE = new WeakHashMap<String, PluginRepository>();
+
+  private boolean auto;
+
+  private List<PluginDescriptor> fRegisteredPlugins;
+
+  private HashMap<String, ExtensionPoint> fExtensionPoints;
+
+  private HashMap<String, Plugin> fActivatedPlugins;
+
+  private static final Map<String, Map<PluginClassLoader, Class>> CLASS_CACHE = new HashMap<String, Map<PluginClassLoader, Class>>();
+
+  private Configuration conf;
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(PluginRepository.class);
+
+  /**
+   * Builds the repository: parses all plugin manifests found on the paths of
+   * <code>plugin.folders</code>, filters them by the regular expressions
+   * <code>plugin.includes</code> and <code>plugin.excludes</code>, performs
+   * dependency checking and installs all extension points and extensions.
+   *
+   * @param conf
+   *          configuration (copied; the copy is passed on to plugins)
+   * @throws RuntimeException
+   *           if an extension refers to a non-existing extension point
+   */
+  public PluginRepository(Configuration conf) throws RuntimeException {
+    fActivatedPlugins = new HashMap<String, Plugin>();
+    fExtensionPoints = new HashMap<String, ExtensionPoint>();
+    this.conf = new Configuration(conf);
+    this.auto = conf.getBoolean("plugin.auto-activation", true);
+    String[] pluginFolders = conf.getStrings("plugin.folders");
+    PluginManifestParser manifestParser = new PluginManifestParser(this.conf,
+        this);
+    Map<String, PluginDescriptor> allPlugins = manifestParser
+        .parsePluginFolder(pluginFolders);
+    if (allPlugins.isEmpty()) {
+      LOG.warn("No plugins found on paths of property plugin.folders=\"{}\"",
+          conf.get("plugin.folders"));
+    }
+    Pattern excludes = Pattern.compile(conf.get("plugin.excludes", ""));
+    Pattern includes = Pattern.compile(conf.get("plugin.includes", ""));
+    Map<String, PluginDescriptor> filteredPlugins = filter(excludes, includes,
+        allPlugins);
+    // In auto-activation mode dependencies may be resolved against all
+    // discovered plugins, not only the explicitly included ones.
+    fRegisteredPlugins = getDependencyCheckedPlugins(filteredPlugins,
+        this.auto ? allPlugins : filteredPlugins);
+    installExtensionPoints(fRegisteredPlugins);
+    try {
+      installExtensions(fRegisteredPlugins);
+    } catch (PluginRuntimeException e) {
+      LOG.error(e.toString());
+      throw new RuntimeException(e.getMessage());
+    }
+    displayStatus();
+  }
+
+  /**
+   * @param conf
+   *          configuration whose UUID (see
+   *          {@link NutchConfiguration#getUUID(Configuration)}) is used as the
+   *          cache key
+   * @return a cached instance of the plugin repository
+   */
+  public static synchronized PluginRepository get(Configuration conf) {
+    String uuid = NutchConfiguration.getUUID(conf);
+    if (uuid == null) {
+      uuid = "nonNutchConf@" + conf.hashCode(); // fallback
+    }
+    PluginRepository result = CACHE.get(uuid);
+    if (result == null) {
+      result = new PluginRepository(conf);
+      CACHE.put(uuid, result);
+    }
+    return result;
+  }
+
+  /** Registers every extension point declared by the given plugins. */
+  private void installExtensionPoints(List<PluginDescriptor> plugins) {
+    if (plugins == null) {
+      return;
+    }
+
+    for (PluginDescriptor plugin : plugins) {
+      for (ExtensionPoint point : plugin.getExtenstionPoints()) {
+        String xpId = point.getId();
+        LOG.debug("Adding extension point " + xpId);
+        fExtensionPoints.put(xpId, point);
+      }
+    }
+  }
+
+  /**
+   * Attaches each extension of the given plugins to its target extension
+   * point.
+   *
+   * @param pRegisteredPlugins
+   *          plugins whose extensions should be installed
+   * @throws PluginRuntimeException
+   *           if an extension targets an unknown extension point
+   */
+  private void installExtensions(List<PluginDescriptor> pRegisteredPlugins)
+      throws PluginRuntimeException {
+
+    for (PluginDescriptor descriptor : pRegisteredPlugins) {
+      for (Extension extension : descriptor.getExtensions()) {
+        String xpId = extension.getTargetPoint();
+        ExtensionPoint point = getExtensionPoint(xpId);
+        if (point == null) {
+          throw new PluginRuntimeException("Plugin ("
+              + descriptor.getPluginId() + "), " + "extension point: " + xpId
+              + " does not exist.");
+        }
+        point.addExtension(extension);
+      }
+    }
+  }
+
+  /**
+   * Recursively collects the transitive dependencies of a plugin into
+   * {@code dependencies}, using {@code branch} to track the current
+   * dependency path for circular-dependency detection.
+   */
+  private void getPluginCheckedDependencies(PluginDescriptor plugin,
+      Map<String, PluginDescriptor> plugins,
+      Map<String, PluginDescriptor> dependencies,
+      Map<String, PluginDescriptor> branch) throws MissingDependencyException,
+      CircularDependencyException {
+
+    if (dependencies == null) {
+      dependencies = new HashMap<String, PluginDescriptor>();
+    }
+    if (branch == null) {
+      branch = new HashMap<String, PluginDescriptor>();
+    }
+    branch.put(plugin.getPluginId(), plugin);
+
+    // Otherwise, checks each dependency
+    for (String id : plugin.getDependencies()) {
+      PluginDescriptor dependency = plugins.get(id);
+      if (dependency == null) {
+        throw new MissingDependencyException("Missing dependency " + id
+            + " for plugin " + plugin.getPluginId());
+      }
+      if (branch.containsKey(id)) {
+        throw new CircularDependencyException("Circular dependency detected "
+            + id + " for plugin " + plugin.getPluginId());
+      }
+      dependencies.put(id, dependency);
+      getPluginCheckedDependencies(plugins.get(id), plugins, dependencies,
+          branch);
+    }
+
+    branch.remove(plugin.getPluginId());
+  }
+
+  /**
+   * @return the complete, cycle-checked transitive dependency set of the
+   *         given plugin (not including the plugin itself)
+   */
+  private Map<String, PluginDescriptor> getPluginCheckedDependencies(
+      PluginDescriptor plugin, Map<String, PluginDescriptor> plugins)
+      throws MissingDependencyException, CircularDependencyException {
+    Map<String, PluginDescriptor> dependencies = new HashMap<String, PluginDescriptor>();
+    Map<String, PluginDescriptor> branch = new HashMap<String, PluginDescriptor>();
+    getPluginCheckedDependencies(plugin, plugins, dependencies, branch);
+    return dependencies;
+  }
+
+  /**
+   * @param filtered
+   *          the filtered list of plugins
+   * @param all
+   *          the list of all plugins found; dependencies are resolved
+   *          against this map
+   * @return plugins whose dependencies can all be satisfied; plugins with
+   *         missing or circular dependencies are logged and dropped
+   */
+  private List<PluginDescriptor> getDependencyCheckedPlugins(
+      Map<String, PluginDescriptor> filtered, Map<String, PluginDescriptor> all) {
+    if (filtered == null) {
+      return null;
+    }
+    Map<String, PluginDescriptor> checked = new HashMap<String, PluginDescriptor>();
+
+    for (PluginDescriptor plugin : filtered.values()) {
+      try {
+        checked.putAll(getPluginCheckedDependencies(plugin, all));
+        checked.put(plugin.getPluginId(), plugin);
+      } catch (MissingDependencyException mde) {
+        // Log exception and ignore plugin
+        LOG.warn(mde.getMessage());
+      } catch (CircularDependencyException cde) {
+        // Simply ignore this plugin
+        LOG.warn(cde.getMessage());
+      }
+    }
+    return new ArrayList<PluginDescriptor>(checked.values());
+  }
+
+  /**
+   * Returns all registered plugin descriptors.
+   * 
+   * @return PluginDescriptor[]
+   */
+  public PluginDescriptor[] getPluginDescriptors() {
+    return fRegisteredPlugins.toArray(new PluginDescriptor[fRegisteredPlugins
+        .size()]);
+  }
+
+  /**
+   * Returns the descriptor of one plugin identified by a plugin id.
+   * 
+   * @param pPluginId
+   *          id of the plugin to look up
+   * @return PluginDescriptor, or <code>null</code> if no registered plugin
+   *         has this id
+   */
+  public PluginDescriptor getPluginDescriptor(String pPluginId) {
+
+    for (PluginDescriptor descriptor : fRegisteredPlugins) {
+      if (descriptor.getPluginId().equals(pPluginId))
+        return descriptor;
+    }
+    return null;
+  }
+
+  /**
+   * Returns an extension point identified by an extension point id.
+   * 
+   * @param pXpId
+   *          extension point id
+   * @return the extension point, or <code>null</code> if unknown
+   */
+  public ExtensionPoint getExtensionPoint(String pXpId) {
+    return this.fExtensionPoints.get(pXpId);
+  }
+
+  /**
+   * Returns an instance of a plugin. Plugin instances are cached, so a plugin
+   * exists only as one instance. This allows central management of a plugin's
+   * own resources.
+   * 
+   * After creating the plugin instance the startUp() method is invoked. The
+   * plugin uses its own classloader that is shared by all instances of
+   * extensions of the same plugin. This class loader uses all exported
+   * libraries from the dependent plugins and all plugin libraries.
+   * 
+   * @param pDescriptor
+   *          descriptor of the plugin to instantiate
+   * @return Plugin
+   * @throws PluginRuntimeException
+   *           if the plugin class cannot be loaded or instantiated
+   */
+  public Plugin getPluginInstance(PluginDescriptor pDescriptor)
+      throws PluginRuntimeException {
+    if (fActivatedPlugins.containsKey(pDescriptor.getPluginId()))
+      return fActivatedPlugins.get(pDescriptor.getPluginId());
+    try {
+      // Must synchronize here to make sure creation and initialization
+      // of a plugin instance are done by one and only one thread.
+      // The same is in Extension.getExtensionInstance().
+      // Suggested by Stefan Groschupf <sg...@media-style.com>
+      synchronized (pDescriptor) {
+        Class<?> pluginClass = getCachedClass(pDescriptor,
+            pDescriptor.getPluginClass());
+        Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] {
+            PluginDescriptor.class, Configuration.class });
+        Plugin plugin = (Plugin) constructor.newInstance(new Object[] {
+            pDescriptor, this.conf });
+        plugin.startUp();
+        fActivatedPlugins.put(pDescriptor.getPluginId(), plugin);
+        return plugin;
+      }
+    } catch (ClassNotFoundException e) {
+      throw new PluginRuntimeException(e);
+    } catch (InstantiationException e) {
+      throw new PluginRuntimeException(e);
+    } catch (IllegalAccessException e) {
+      throw new PluginRuntimeException(e);
+    } catch (NoSuchMethodException e) {
+      throw new PluginRuntimeException(e);
+    } catch (InvocationTargetException e) {
+      throw new PluginRuntimeException(e);
+    }
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * NOTE(review): finalize() is deprecated in modern Java and its invocation
+   * is not guaranteed — consider an explicit shutdown hook instead.
+   * 
+   * @see java.lang.Object#finalize()
+   */
+  public void finalize() throws Throwable {
+    shutDownActivatedPlugins();
+  }
+
+  /**
+   * Shuts down all plugins
+   * 
+   * @throws PluginRuntimeException
+   */
+  private void shutDownActivatedPlugins() throws PluginRuntimeException {
+    for (Plugin plugin : fActivatedPlugins.values()) {
+      plugin.shutDown();
+    }
+  }
+
+  /**
+   * Loads (and caches) a class through the classloader of the given plugin.
+   * NOTE(review): access to the static CLASS_CACHE map is not synchronized
+   * here — confirm callers serialize access (getPluginInstance synchronizes
+   * on the descriptor only).
+   */
+  public Class getCachedClass(PluginDescriptor pDescriptor, String className)
+      throws ClassNotFoundException {
+    Map<PluginClassLoader, Class> descMap = CLASS_CACHE.get(className);
+    if (descMap == null) {
+      descMap = new HashMap<PluginClassLoader, Class>();
+      CLASS_CACHE.put(className, descMap);
+    }
+    PluginClassLoader loader = pDescriptor.getClassLoader();
+    Class clazz = descMap.get(loader);
+    if (clazz == null) {
+      clazz = loader.loadClass(className);
+      descMap.put(loader, clazz);
+    }
+    return clazz;
+  }
+
+  /** Logs the activation mode, registered plugins and extension points. */
+  private void displayStatus() {
+    LOG.info("Plugin Auto-activation mode: [" + this.auto + "]");
+    LOG.info("Registered Plugins:");
+
+    if ((fRegisteredPlugins == null) || (fRegisteredPlugins.size() == 0)) {
+      LOG.info("\tNONE");
+    } else {
+      for (PluginDescriptor plugin : fRegisteredPlugins) {
+        LOG.info("\t" + plugin.getName() + " (" + plugin.getPluginId() + ")");
+      }
+    }
+
+    LOG.info("Registered Extension-Points:");
+    if ((fExtensionPoints == null) || (fExtensionPoints.size() == 0)) {
+      LOG.info("\tNONE");
+    } else {
+      for (ExtensionPoint ep : fExtensionPoints.values()) {
+        LOG.info("\t" + ep.getName() + " (" + ep.getId() + ")");
+      }
+    }
+  }
+
+  /**
+   * Filters a list of plugins. The list of plugins is filtered regarding the
+   * configuration properties <code>plugin.excludes</code> and
+   * <code>plugin.includes</code>.
+   * 
+   * @param excludes
+   *          pattern a plugin id must NOT match to be kept
+   * @param includes
+   *          pattern a plugin id must match to be kept
+   * @param plugins
+   *          Map of plugins
+   * @return map of plugins matching the configuration
+   */
+  private Map<String, PluginDescriptor> filter(Pattern excludes,
+      Pattern includes, Map<String, PluginDescriptor> plugins) {
+
+    Map<String, PluginDescriptor> map = new HashMap<String, PluginDescriptor>();
+
+    if (plugins == null) {
+      return map;
+    }
+
+    for (PluginDescriptor plugin : plugins.values()) {
+
+      if (plugin == null) {
+        continue;
+      }
+      String id = plugin.getPluginId();
+      if (id == null) {
+        continue;
+      }
+
+      // includes is checked first; a plugin must match includes AND
+      // not match excludes to survive
+      if (!includes.matcher(id).matches()) {
+        LOG.debug("not including: " + id);
+        continue;
+      }
+      if (excludes.matcher(id).matches()) {
+        LOG.debug("excluding: " + id);
+        continue;
+      }
+      map.put(plugin.getPluginId(), plugin);
+    }
+    return map;
+  }
+
+  /**
+   * Get ordered list of plugins. Filter and normalization plugins are applied
+   * in a configurable "pipeline" order, e.g., if one plugin depends on the
+   * output of another plugin. This method loads the plugins in the order
+   * defined by orderProperty. If orderProperty is empty or unset, all active
+   * plugins of the given interface and extension point are loaded.
+   * 
+   * @param clazz
+   *          interface class implemented by required plugins
+   * @param xPointId
+   *          extension point id of required plugins
+   * @param orderProperty
+   *          property name defining plugin order
+   * @return array of plugin instances
+   */
+  public synchronized Object[] getOrderedPlugins(Class<?> clazz,
+      String xPointId, String orderProperty) {
+    Object[] filters;
+    ObjectCache objectCache = ObjectCache.get(conf);
+    filters = (Object[]) objectCache.getObject(clazz.getName());
+
+    if (filters == null) {
+      String order = conf.get(orderProperty);
+      List<String> orderOfFilters = new ArrayList<String>();
+      boolean userDefinedOrder = false;
+      if (order != null && !order.trim().isEmpty()) {
+        orderOfFilters = Arrays.asList(order.trim().split("\\s+"));
+        userDefinedOrder = true;
+      }
+
+      try {
+        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+            xPointId);
+        if (point == null)
+          throw new RuntimeException(xPointId + " not found.");
+        Extension[] extensions = point.getExtensions();
+        HashMap<String, Object> filterMap = new HashMap<String, Object>();
+        for (int i = 0; i < extensions.length; i++) {
+          Extension extension = extensions[i];
+          Object filter = extension.getExtensionInstance();
+          // keep the first instance per class; without a user-defined
+          // order plugins are appended in discovery order
+          if (!filterMap.containsKey(filter.getClass().getName())) {
+            filterMap.put(filter.getClass().getName(), filter);
+            if (!userDefinedOrder)
+              orderOfFilters.add(filter.getClass().getName());
+          }
+        }
+        List<Object> sorted = new ArrayList<Object>();
+        for (String orderedFilter : orderOfFilters) {
+          Object f = filterMap.get(orderedFilter);
+          if (f == null) {
+            LOG.error(clazz.getSimpleName() + " : " + orderedFilter
+                + " declared in configuration property " + orderProperty
+                + " but not found in an active plugin - ignoring.");
+            continue;
+          }
+          sorted.add(f);
+        }
+        Object[] filter = (Object[]) Array.newInstance(clazz, sorted.size());
+        for (int i = 0; i < sorted.size(); i++) {
+          filter[i] = sorted.get(i);
+          if (LOG.isTraceEnabled()) {
+            LOG.trace(clazz.getSimpleName() + " : filters[" + i + "] = "
+                + filter[i].getClass());
+          }
+        }
+        objectCache.setObject(clazz.getName(), filter);
+      } catch (PluginRuntimeException e) {
+        throw new RuntimeException(e);
+      }
+
+      filters = (Object[]) objectCache.getObject(clazz.getName());
+    }
+    return filters;
+  }
+
+  /**
+   * Loads all necessary dependencies for a selected plugin, and then runs one
+   * of the classes' main() method.
+   * 
+   * @param args
+   *          plugin ID (needs to be activated in the configuration), and the
+   *          class name. The rest of arguments is passed to the main method of
+   *          the selected class.
+   * @throws Exception
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err
+          .println("Usage: PluginRepository pluginId className [arg1 arg2 ...]");
+      return;
+    }
+    Configuration conf = NutchConfiguration.create();
+    PluginRepository repo = new PluginRepository(conf);
+    // args[0] - plugin ID
+    PluginDescriptor d = repo.getPluginDescriptor(args[0]);
+    if (d == null) {
+      System.err.println("Plugin '" + args[0] + "' not present or inactive.");
+      return;
+    }
+    ClassLoader cl = d.getClassLoader();
+    // args[1] - class name
+    Class<?> clazz = null;
+    try {
+      clazz = Class.forName(args[1], true, cl);
+    } catch (Exception e) {
+      System.err.println("Could not load the class '" + args[1] + ": "
+          + e.getMessage());
+      return;
+    }
+    Method m = null;
+    try {
+      m = clazz.getMethod("main", new Class<?>[] { args.getClass() });
+    } catch (Exception e) {
+      System.err.println("Could not find the 'main(String[])' method in class "
+          + args[1] + ": " + e.getMessage());
+      return;
+    }
+    String[] subargs = new String[args.length - 2];
+    System.arraycopy(args, 2, subargs, 0, subargs.length);
+    m.invoke(null, new Object[] { subargs });
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRuntimeException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRuntimeException.java b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRuntimeException.java
new file mode 100644
index 0000000..acccda2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/PluginRuntimeException.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+/**
+ * <code>PluginRuntimeException</code> is thrown when an exception in the
+ * plugin management occurs.
+ * 
+ * @author joa23
+ */
+public class PluginRuntimeException extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  /** Wraps an underlying cause, e.g. a reflection or classloading failure. */
+  public PluginRuntimeException(Throwable cause) {
+    super(cause);
+  }
+
+  /** Creates an exception with a descriptive message only. */
+  public PluginRuntimeException(String message) {
+    super(message);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/plugin/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/plugin/package.html b/nutch-core/src/main/java/org/apache/nutch/plugin/package.html
new file mode 100644
index 0000000..5ca4c9e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/plugin/package.html
@@ -0,0 +1,40 @@
+<html>
+<body>
+The Nutch {@link org.apache.nutch.plugin.Pluggable Plugin} System.
+<p>
+<b>The Nutch Plugin System provides a way to extend nutch functionality</b>.
+A large part of the functionality of Nutch is provided by plugins:
+All of the parsing, indexing and searching that nutch does is actually
+accomplished by various plugins.
+</p><p>
+In writing a plugin, you're actually providing one or more extensions of the
+existing extension-points (<i>hooks</i>).
+The core Nutch extension-points are themselves defined in a plugin,
+the <code>nutch-extensionpoints</code> plugin.
+Each extension-point defines an interface that must be implemented by the
+extension. The core extension-points and extensions available in Nutch are
+listed in the {@link org.apache.nutch.plugin.Pluggable} interface.
+</p>
+
+@see <a href="./doc-files/plugin.dtd">Nutch plugin manifest DTD</a>
+
+@see <a href="http://wiki.apache.org/nutch/PluginCentral">
+     Plugin Central
+     </a>
+@see <a href="http://wiki.apache.org/nutch/AboutPlugins">
+     About Plugins
+     </a>
+@see <a href="http://wiki.apache.org/nutch/WhyNutchHasAPluginSystem">
+     Why Nutch has a Plugin System?
+     </a>
+@see <a href="http://wiki.apache.org/nutch/WhichTechnicalConceptsAreBehindTheNutchPluginSystem">
+     Which technical concepts are behind the nutch plugin system?
+     </a>
+@see <a href="http://wiki.apache.org/nutch/WhatsTheProblemWithPluginsAndClass-loading">
+     What's the problem with Plugins and Class loading?
+     </a>
+@see <a href="http://wiki.apache.org/nutch/WritingPluginExample">
+     Writing Plugin Example
+     </a>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/Content.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/Content.java b/nutch-core/src/main/java/org/apache/nutch/protocol/Content.java
new file mode 100755
index 0000000..4dc8277
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/Content.java
@@ -0,0 +1,296 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+//JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.zip.InflaterInputStream;
+
+//Hadoop imports
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.ArrayFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.GenericOptionsParser;
+
+//Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Holds the raw content fetched for one URL together with its base URL,
+ * content type and protocol metadata. Serialized via Hadoop's
+ * {@link Writable}; a negative leading int marks the current (uncompressed)
+ * on-disk version, a non-negative int is the size of a legacy
+ * zlib-compressed record.
+ */
+public final class Content implements Writable {
+
+  public static final String DIR_NAME = "content";
+
+  // current serialization version; negative so it can be distinguished
+  // from the size field of older compressed records (see readFields)
+  private final static int VERSION = -1;
+
+  private int version;
+
+  private String url;
+
+  private String base;
+
+  private byte[] content;
+
+  private String contentType;
+
+  private Metadata metadata;
+
+  private MimeUtil mimeTypes;
+
+  /** No-arg constructor used by the Writable deserialization machinery. */
+  public Content() {
+    metadata = new Metadata();
+  }
+
+  /**
+   * @param url
+   *          the fetched URL (must not be null)
+   * @param base
+   *          base URL for resolving relative links (must not be null)
+   * @param content
+   *          raw fetched bytes (must not be null)
+   * @param contentType
+   *          content type as reported by the protocol; may be refined by
+   *          MIME auto-detection
+   * @param metadata
+   *          protocol-specific metadata (must not be null)
+   * @param conf
+   *          configuration used to set up MIME type resolution
+   */
+  public Content(String url, String base, byte[] content, String contentType,
+      Metadata metadata, Configuration conf) {
+
+    if (url == null)
+      throw new IllegalArgumentException("null url");
+    if (base == null)
+      throw new IllegalArgumentException("null base");
+    if (content == null)
+      throw new IllegalArgumentException("null content");
+    if (metadata == null)
+      throw new IllegalArgumentException("null metadata");
+
+    this.url = url;
+    this.base = base;
+    this.content = content;
+    this.metadata = metadata;
+
+    this.mimeTypes = new MimeUtil(conf);
+    this.contentType = getContentType(contentType, url, content);
+  }
+
+  /** Reads a legacy compressed record (on-disk versions 0, 1 and 2). */
+  private final void readFieldsCompressed(DataInput in) throws IOException {
+    byte oldVersion = in.readByte();
+    switch (oldVersion) {
+    case 0:
+    case 1:
+      url = Text.readString(in); // read url
+      base = Text.readString(in); // read base
+
+      content = new byte[in.readInt()]; // read content
+      in.readFully(content);
+
+      contentType = Text.readString(in); // read contentType
+      // reconstruct metadata from explicit key/value-count encoding
+      int keySize = in.readInt();
+      String key;
+      for (int i = 0; i < keySize; i++) {
+        key = Text.readString(in);
+        int valueSize = in.readInt();
+        for (int j = 0; j < valueSize; j++) {
+          metadata.add(key, Text.readString(in));
+        }
+      }
+      break;
+    case 2:
+      url = Text.readString(in); // read url
+      base = Text.readString(in); // read base
+
+      content = new byte[in.readInt()]; // read content
+      in.readFully(content);
+
+      contentType = Text.readString(in); // read contentType
+      metadata.readFields(in); // read meta data
+      break;
+    default:
+      throw new VersionMismatchException((byte) 2, oldVersion);
+    }
+
+  }
+
+  public final void readFields(DataInput in) throws IOException {
+    metadata.clear();
+    // the first int is either the (negative) version of the current
+    // format or the (non-negative) size of a legacy compressed record
+    int sizeOrVersion = in.readInt();
+    if (sizeOrVersion < 0) { // version
+      version = sizeOrVersion;
+      switch (version) {
+      case VERSION:
+        url = Text.readString(in);
+        base = Text.readString(in);
+
+        content = new byte[in.readInt()];
+        in.readFully(content);
+
+        contentType = Text.readString(in);
+        metadata.readFields(in);
+        break;
+      default:
+        throw new VersionMismatchException((byte) VERSION, (byte) version);
+      }
+    } else { // size
+      byte[] compressed = new byte[sizeOrVersion];
+      in.readFully(compressed, 0, compressed.length);
+      ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
+      DataInput inflater = new DataInputStream(
+          new InflaterInputStream(deflated));
+      readFieldsCompressed(inflater);
+    }
+  }
+
+  public final void write(DataOutput out) throws IOException {
+    out.writeInt(VERSION);
+
+    Text.writeString(out, url); // write url
+    Text.writeString(out, base); // write base
+
+    out.writeInt(content.length); // write content
+    out.write(content);
+
+    Text.writeString(out, contentType); // write contentType
+
+    metadata.write(out); // write metadata
+  }
+
+  /** Convenience factory: deserializes one Content record from the input. */
+  public static Content read(DataInput in) throws IOException {
+    Content content = new Content();
+    content.readFields(in);
+    return content;
+  }
+
+  //
+  // Accessor methods
+  //
+
+  /** The url fetched. */
+  public String getUrl() {
+    return url;
+  }
+
+  /**
+   * The base url for relative links contained in the content. May be
+   * different from url if the request was redirected.
+   */
+  public String getBaseUrl() {
+    return base;
+  }
+
+  /** The binary content retrieved. */
+  public byte[] getContent() {
+    return content;
+  }
+
+  public void setContent(byte[] content) {
+    this.content = content;
+  }
+
+  /**
+   * The media type of the retrieved content.
+   * 
+   * @see <a href="http://www.iana.org/assignments/media-types/">
+   *      http://www.iana.org/assignments/media-types/</a>
+   */
+  public String getContentType() {
+    return contentType;
+  }
+
+  public void setContentType(String contentType) {
+    this.contentType = contentType;
+  }
+
+  /** Other protocol-specific data. */
+  public Metadata getMetadata() {
+    return metadata;
+  }
+
+  /** Other protocol-specific data. */
+  public void setMetadata(Metadata metadata) {
+    this.metadata = metadata;
+  }
+
+  // NOTE(review): equals() is overridden without hashCode(); instances are
+  // unsafe as hash-based collection keys — confirm this is never relied on.
+  public boolean equals(Object o) {
+    if (!(o instanceof Content)) {
+      return false;
+    }
+    Content that = (Content) o;
+    return this.url.equals(that.url) && this.base.equals(that.base)
+        && Arrays.equals(this.getContent(), that.getContent())
+        && this.contentType.equals(that.contentType)
+        && this.metadata.equals(that.metadata);
+  }
+
+  public String toString() {
+    StringBuffer buffer = new StringBuffer();
+
+    buffer.append("Version: " + version + "\n");
+    buffer.append("url: " + url + "\n");
+    buffer.append("base: " + base + "\n");
+    buffer.append("contentType: " + contentType + "\n");
+    buffer.append("metadata: " + metadata + "\n");
+    buffer.append("Content:\n");
+    // NOTE(review): decodes the raw bytes with the platform default
+    // charset; output may differ across JVMs/locales
+    buffer.append(new String(content)); // try default encoding
+
+    return buffer.toString();
+
+  }
+
+  /**
+   * Command-line dump tool: prints the Content record at the given record
+   * number of a segment's content directory.
+   */
+  public static void main(String argv[]) throws Exception {
+
+    String usage = "Content (-local | -dfs <namenode:port>) recno segment";
+
+    if (argv.length < 3) {
+      System.out.println("usage:" + usage);
+      return;
+    }
+    Options opts = new Options();
+    Configuration conf = NutchConfiguration.create();
+
+    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
+    // generic Hadoop options (e.g. -fs) are consumed by the parser;
+    // remaining args are expected to be: recno segment
+    String[] remainingArgs = parser.getRemainingArgs();
+    FileSystem fs = FileSystem.get(conf);
+
+    try {
+      int recno = Integer.parseInt(remainingArgs[0]);
+      String segment = remainingArgs[1];
+
+      Path file = new Path(segment, DIR_NAME);
+      System.out.println("Reading from file: " + file);
+
+      ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
+          conf);
+
+      Content content = new Content();
+      contents.get(recno, content);
+      System.out.println("Retrieved " + recno + " from file " + file);
+
+      System.out.println(content);
+
+      contents.close();
+    } finally {
+      // NOTE(review): FileSystem.get() may return a shared, cached
+      // instance; closing it here affects other users in the same JVM
+      fs.close();
+    }
+  }
+
+  /** Resolves the effective content type via MIME auto-detection. */
+  private String getContentType(String typeName, String url, byte[] data) {
+    return this.mimeTypes.autoResolveContentType(typeName, url, data);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/Protocol.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/Protocol.java b/nutch-core/src/main/java/org/apache/nutch/protocol/Protocol.java
new file mode 100755
index 0000000..0aa5d29
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/Protocol.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.plugin.Pluggable;
+
+import crawlercommons.robots.BaseRobotRules;
+
+/** A retriever of url content. Implemented by protocol extensions. */
+public interface Protocol extends Pluggable, Configurable {
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = Protocol.class.getName();
+
+  /**
+   * Property name. If in the current configuration this property is set to
+   * true, protocol implementations should handle "politeness" limits
+   * internally. If this is set to false, it is assumed that these limits are
+   * enforced elsewhere, and protocol implementations should not enforce them
+   * internally.
+   */
+  public final static String CHECK_BLOCKING = "protocol.plugin.check.blocking";
+
+  /**
+   * Property name. If in the current configuration this property is set to
+   * true, protocol implementations should handle robot exclusion rules
+   * internally. If this is set to false, it is assumed that these limits are
+   * enforced elsewhere, and protocol implementations should not enforce them
+   * internally.
+   */
+  public final static String CHECK_ROBOTS = "protocol.plugin.check.robots";
+
+  /**
+   * Returns the {@link Content} for a fetchlist entry.
+   */
+  ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);
+
+  /**
+   * Retrieve robot rules applicable for this url.
+   * 
+   * @param url
+   *          url to check
+   * @param datum
+   *          page datum
+   * @return robot rules (specific for this url or default), never null
+   */
+  BaseRobotRules getRobotRules(Text url, CrawlDatum datum);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolException.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolException.java
new file mode 100755
index 0000000..fc4add5
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolException.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
/**
 * Base checked exception raised by {@link Protocol} implementations when
 * fetching content or robot rules fails.
 */
public class ProtocolException extends Exception {

  // Explicit serial version id instead of @SuppressWarnings("serial"):
  // keeps the serialized form stable across compiler versions.
  private static final long serialVersionUID = 1L;

  public ProtocolException() {
    super();
  }

  public ProtocolException(String message) {
    super(message);
  }

  public ProtocolException(String message, Throwable cause) {
    super(message, cause);
  }

  public ProtocolException(Throwable cause) {
    super(cause);
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolFactory.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolFactory.java
new file mode 100644
index 0000000..8a92d60
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolFactory.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.plugin.*;
+import org.apache.nutch.util.ObjectCache;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Creates and caches {@link Protocol} plugins. Protocol plugins should define
+ * the attribute "protocolName" with the name of the protocol that they
+ * implement. Configuration object is used for caching. Cache key is constructed
+ * from appending protocol name (eg. http) to constant
+ * {@link Protocol#X_POINT_ID}.
+ */
+public class ProtocolFactory {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ProtocolFactory.class);
+
+  private ExtensionPoint extensionPoint;
+
+  private Configuration conf;
+
+  public ProtocolFactory(Configuration conf) {
+    this.conf = conf;
+    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
+        Protocol.X_POINT_ID);
+    if (this.extensionPoint == null) {
+      throw new RuntimeException("x-point " + Protocol.X_POINT_ID
+          + " not found.");
+    }
+  }
+
+  /**
+   * Returns the appropriate {@link Protocol} implementation for a url.
+   * 
+   * @param urlString
+   *          Url String
+   * @return The appropriate {@link Protocol} implementation for a given
+   *         {@link URL}.
+   * @throws ProtocolNotFound
+   *           when Protocol can not be found for urlString
+   */
+  public synchronized Protocol getProtocol(String urlString)
+      throws ProtocolNotFound {
+    ObjectCache objectCache = ObjectCache.get(conf);
+    try {
+      URL url = new URL(urlString);
+      String protocolName = url.getProtocol();
+      if (protocolName == null)
+        throw new ProtocolNotFound(urlString);
+
+      String cacheId = Protocol.X_POINT_ID + protocolName;
+      Protocol protocol = (Protocol) objectCache.getObject(cacheId);
+      if (protocol != null) {
+        return protocol;
+      }
+
+      Extension extension = findExtension(protocolName);
+      if (extension == null) {
+        throw new ProtocolNotFound(protocolName);
+      }
+
+      protocol = (Protocol) extension.getExtensionInstance();
+      objectCache.setObject(cacheId, protocol);
+      return protocol;
+    } catch (MalformedURLException e) {
+      throw new ProtocolNotFound(urlString, e.toString());
+    } catch (PluginRuntimeException e) {
+      throw new ProtocolNotFound(urlString, e.toString());
+    }
+  }
+
+  private Extension findExtension(String name) throws PluginRuntimeException {
+
+    Extension[] extensions = this.extensionPoint.getExtensions();
+
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+
+      if (contains(name, extension.getAttribute("protocolName")))
+        return extension;
+    }
+    return null;
+  }
+
+  boolean contains(String what, String where) {
+    String parts[] = where.split("[, ]");
+    for (int i = 0; i < parts.length; i++) {
+      if (parts[i].equals(what))
+        return true;
+    }
+    return false;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolNotFound.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolNotFound.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolNotFound.java
new file mode 100644
index 0000000..8cadc23
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolNotFound.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+@SuppressWarnings("serial")
+public class ProtocolNotFound extends ProtocolException {
+  private String url;
+
+  public ProtocolNotFound(String url) {
+    this(url, "protocol not found for url=" + url);
+  }
+
+  public ProtocolNotFound(String url, String message) {
+    super(message);
+    this.url = url;
+  }
+
+  public String getUrl() {
+    return url;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolOutput.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolOutput.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolOutput.java
new file mode 100644
index 0000000..c7f0c2c
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolOutput.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+/**
+ * Simple aggregate to pass from protocol plugins both content and protocol
+ * status.
+ * 
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class ProtocolOutput {
+  private Content content;
+  private ProtocolStatus status;
+
+  public ProtocolOutput(Content content, ProtocolStatus status) {
+    this.content = content;
+    this.status = status;
+  }
+
+  public ProtocolOutput(Content content) {
+    this.content = content;
+    this.status = ProtocolStatus.STATUS_SUCCESS;
+  }
+
+  public Content getContent() {
+    return content;
+  }
+
+  public void setContent(Content content) {
+    this.content = content;
+  }
+
+  public ProtocolStatus getStatus() {
+    return status;
+  }
+
+  public void setStatus(ProtocolStatus status) {
+    this.status = status;
+  }
+}


[40/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolStatus.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolStatus.java b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolStatus.java
new file mode 100644
index 0000000..9e75531
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/ProtocolStatus.java
@@ -0,0 +1,297 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+
+/**
+ * @author Andrzej Bialecki
+ */
+public class ProtocolStatus implements Writable {
+
+  private final static byte VERSION = 2;
+
+  /** Content was retrieved without errors. */
+  public static final int SUCCESS = 1;
+  /** Content was not retrieved. Any further errors may be indicated in args. */
+  public static final int FAILED = 2;
+
+  /** This protocol was not found. Application may attempt to retry later. */
+  public static final int PROTO_NOT_FOUND = 10;
+  /** Resource is gone. */
+  public static final int GONE = 11;
+  /** Resource has moved permanently. New url should be found in args. */
+  public static final int MOVED = 12;
+  /** Resource has moved temporarily. New url should be found in args. */
+  public static final int TEMP_MOVED = 13;
+  /** Resource was not found. */
+  public static final int NOTFOUND = 14;
+  /** Temporary failure. Application may retry immediately. */
+  public static final int RETRY = 15;
+  /**
+   * Unspecified exception occured. Further information may be provided in args.
+   */
+  public static final int EXCEPTION = 16;
+  /** Access denied - authorization required, but missing/incorrect. */
+  public static final int ACCESS_DENIED = 17;
+  /** Access denied by robots.txt rules. */
+  public static final int ROBOTS_DENIED = 18;
+  /** Too many redirects. */
+  public static final int REDIR_EXCEEDED = 19;
+  /** Not fetching. */
+  public static final int NOTFETCHING = 20;
+  /** Unchanged since the last fetch. */
+  public static final int NOTMODIFIED = 21;
+  /**
+   * Request was refused by protocol plugins, because it would block. The
+   * expected number of milliseconds to wait before retry may be provided in
+   * args.
+   */
+  public static final int WOULDBLOCK = 22;
+  /** Thread was blocked http.max.delays times during fetching. */
+  public static final int BLOCKED = 23;
+
+  // Useful static instances for status codes that don't usually require any
+  // additional arguments.
+  public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(
+      SUCCESS);
+  public static final ProtocolStatus STATUS_FAILED = new ProtocolStatus(FAILED);
+  public static final ProtocolStatus STATUS_GONE = new ProtocolStatus(GONE);
+  public static final ProtocolStatus STATUS_NOTFOUND = new ProtocolStatus(
+      NOTFOUND);
+  public static final ProtocolStatus STATUS_RETRY = new ProtocolStatus(RETRY);
+  public static final ProtocolStatus STATUS_ROBOTS_DENIED = new ProtocolStatus(
+      ROBOTS_DENIED);
+  public static final ProtocolStatus STATUS_REDIR_EXCEEDED = new ProtocolStatus(
+      REDIR_EXCEEDED);
+  public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(
+      NOTFETCHING);
+  public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(
+      NOTMODIFIED);
+  public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(
+      WOULDBLOCK);
+  public static final ProtocolStatus STATUS_BLOCKED = new ProtocolStatus(
+      BLOCKED);
+
+  private int code;
+  private long lastModified;
+  private String[] args;
+
+  private static final HashMap<Integer, String> codeToName = new HashMap<Integer, String>();
+  static {
+    codeToName.put(new Integer(SUCCESS), "success");
+    codeToName.put(new Integer(FAILED), "failed");
+    codeToName.put(new Integer(PROTO_NOT_FOUND), "proto_not_found");
+    codeToName.put(new Integer(GONE), "gone");
+    codeToName.put(new Integer(MOVED), "moved");
+    codeToName.put(new Integer(TEMP_MOVED), "temp_moved");
+    codeToName.put(new Integer(NOTFOUND), "notfound");
+    codeToName.put(new Integer(RETRY), "retry");
+    codeToName.put(new Integer(EXCEPTION), "exception");
+    codeToName.put(new Integer(ACCESS_DENIED), "access_denied");
+    codeToName.put(new Integer(ROBOTS_DENIED), "robots_denied");
+    codeToName.put(new Integer(REDIR_EXCEEDED), "redir_exceeded");
+    codeToName.put(new Integer(NOTFETCHING), "notfetching");
+    codeToName.put(new Integer(NOTMODIFIED), "notmodified");
+    codeToName.put(new Integer(WOULDBLOCK), "wouldblock");
+    codeToName.put(new Integer(BLOCKED), "blocked");
+  }
+
+  public ProtocolStatus() {
+
+  }
+
+  public ProtocolStatus(int code, String[] args) {
+    this.code = code;
+    this.args = args;
+  }
+
+  public ProtocolStatus(int code, String[] args, long lastModified) {
+    this.code = code;
+    this.args = args;
+    this.lastModified = lastModified;
+  }
+
+  public ProtocolStatus(int code) {
+    this(code, null);
+  }
+
+  public ProtocolStatus(int code, long lastModified) {
+    this(code, null, lastModified);
+  }
+
+  public ProtocolStatus(int code, Object message) {
+    this(code, message, 0L);
+  }
+
+  public ProtocolStatus(int code, Object message, long lastModified) {
+    this.code = code;
+    this.lastModified = lastModified;
+    if (message != null)
+      this.args = new String[] { String.valueOf(message) };
+  }
+
+  public ProtocolStatus(Throwable t) {
+    this(EXCEPTION, t);
+  }
+
+  public static ProtocolStatus read(DataInput in) throws IOException {
+    ProtocolStatus res = new ProtocolStatus();
+    res.readFields(in);
+    return res;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    byte version = in.readByte();
+    switch (version) {
+    case 1:
+      code = in.readByte();
+      lastModified = in.readLong();
+      args = WritableUtils.readCompressedStringArray(in);
+      break;
+    case VERSION:
+      code = in.readByte();
+      lastModified = in.readLong();
+      args = WritableUtils.readStringArray(in);
+      break;
+    default:
+      throw new VersionMismatchException(VERSION, version);
+    }
+  }
+
+  public void write(DataOutput out) throws IOException {
+    out.writeByte(VERSION);
+    out.writeByte((byte) code);
+    out.writeLong(lastModified);
+    if (args == null) {
+      out.writeInt(-1);
+    } else {
+      WritableUtils.writeStringArray(out, args);
+    }
+  }
+
+  public void setArgs(String[] args) {
+    this.args = args;
+  }
+
+  public String[] getArgs() {
+    return args;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getName() {
+    return codeToName.get(this.code);
+  }
+
+  public void setCode(int code) {
+    this.code = code;
+  }
+
+  public boolean isSuccess() {
+    return code == SUCCESS;
+  }
+
+  public boolean isTransientFailure() {
+    return code == ACCESS_DENIED || code == EXCEPTION || code == REDIR_EXCEEDED
+        || code == RETRY || code == TEMP_MOVED || code == WOULDBLOCK
+        || code == PROTO_NOT_FOUND;
+  }
+
+  public boolean isPermanentFailure() {
+    return code == FAILED || code == GONE || code == MOVED || code == NOTFOUND
+        || code == ROBOTS_DENIED;
+  }
+
+  public boolean isRedirect() {
+      return code == MOVED || code == TEMP_MOVED;
+  }
+
+  public String getMessage() {
+    if (args != null && args.length > 0)
+      return args[0];
+    return null;
+  }
+
+  public void setMessage(String msg) {
+    if (args != null && args.length > 0)
+      args[0] = msg;
+    else
+      args = new String[] { msg };
+  }
+
+  public long getLastModified() {
+    return lastModified;
+  }
+
+  public void setLastModified(long lastModified) {
+    this.lastModified = lastModified;
+  }
+
+  public boolean equals(Object o) {
+    if (o == null)
+      return false;
+    if (!(o instanceof ProtocolStatus))
+      return false;
+    ProtocolStatus other = (ProtocolStatus) o;
+    if (this.code != other.code || this.lastModified != other.lastModified)
+      return false;
+    if (this.args == null) {
+      if (other.args == null)
+        return true;
+      else
+        return false;
+    } else {
+      if (other.args == null)
+        return false;
+      if (other.args.length != this.args.length)
+        return false;
+      for (int i = 0; i < this.args.length; i++) {
+        if (!this.args[i].equals(other.args[i]))
+          return false;
+      }
+    }
+    return true;
+  }
+
+  public String toString() {
+    StringBuffer res = new StringBuffer();
+    res.append(codeToName.get(new Integer(code)) + "(" + code
+        + "), lastModified=" + lastModified);
+    if (args != null) {
+      if (args.length == 1) {
+        res.append(": " + String.valueOf(args[0]));
+      } else {
+        for (int i = 0; i < args.length; i++) {
+          if (args[i] != null)
+            res.append(", args[" + i + "]=" + String.valueOf(args[i]));
+        }
+      }
+    }
+    return res.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/RobotRulesParser.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/RobotRulesParser.java b/nutch-core/src/main/java/org/apache/nutch/protocol/RobotRulesParser.java
new file mode 100644
index 0000000..475aef4
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -0,0 +1,325 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+// JDK imports
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.LineNumberReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Set;
import java.util.StringTokenizer;
+
+// Commons Logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.SuffixStringMatcher;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
+import crawlercommons.robots.SimpleRobotRulesParser;
+
+/**
+ * This class uses crawler-commons for handling the parsing of
+ * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe
+ * the download permissions as described in SimpleRobotRulesParser.
+ * 
+ * Protocol-specific implementations have to implement the method
+ * {@link getRobotRulesSet}.
+ */
+public abstract class RobotRulesParser implements Tool {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(RobotRulesParser.class);
+
+  protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<String, BaseRobotRules>();
+  
+  /**
+   * A {@link BaseRobotRules} object appropriate for use when the
+   * {@code robots.txt} file is empty or missing; all requests are allowed.
+   */
+  public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(
+      RobotRulesMode.ALLOW_ALL);
+
+  /**
+   * A {@link BaseRobotRules} object appropriate for use when the
+   * {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}
+   * response; all requests are disallowed.
+   */
+  public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(
+      RobotRulesMode.ALLOW_NONE);
+
+  private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+  protected Configuration conf;
+  protected String agentNames;
+
+  /** set of host names or IPs to be explicitly excluded from robots.txt checking */
+  protected Set<String> whiteList = new HashSet<String>();
+  
+  /* Matcher user for efficiently matching URLs against a set of suffixes. */
+  private SuffixStringMatcher matcher = null;
+
+  public RobotRulesParser() {
+  }
+
+  public RobotRulesParser(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // Grab the agent names we advertise to robots files.
+    String agentName = conf.get("http.agent.name");
+    if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
+      throw new RuntimeException("Agent name not configured!");
+    }
+    agentNames = agentName;
+
+    // If there are any other agents specified, append those to the list of
+    // agents
+    String otherAgents = conf.get("http.robots.agents");
+    if (otherAgents != null && !otherAgents.trim().isEmpty()) {
+      StringTokenizer tok = new StringTokenizer(otherAgents, ",");
+      StringBuilder sb = new StringBuilder(agentNames);
+      while (tok.hasMoreTokens()) {
+        String str = tok.nextToken().trim();
+        if (str.equals("*") || str.equals(agentName)) {
+          // skip wildcard "*" or agent name itself
+          // (required for backward compatibility, cf. NUTCH-1715 and
+          // NUTCH-1718)
+        } else {
+          sb.append(",").append(str);
+        }
+      }
+
+      agentNames = sb.toString();
+    }
+
+    String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
+    if (confWhiteList == null) {
+      LOG.info("robots.txt whitelist not configured.");
+    }
+    else {
+      for (int i = 0; i < confWhiteList.length; i++) {
+        if (confWhiteList[i].isEmpty()) {
+      	  LOG.info("Empty whitelisted URL skipped!");
+      	  continue;
+        }
+        whiteList.add(confWhiteList[i]);
+      }
+      
+      if (whiteList.size() > 0) {
+        matcher = new SuffixStringMatcher(whiteList);
+        LOG.info("Whitelisted hosts: " + whiteList);
+      }
+    }
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Check whether a URL belongs to a whitelisted host.
+   */
+  public boolean isWhiteListed(URL url) {
+    boolean match = false;
+    String urlString = url.getHost();
+    
+    if (matcher != null) {
+    	match = matcher.matches(urlString);
+    }
+    
+    return match;
+  }
+
+  /**
+   * Parses the robots content using the {@link SimpleRobotRulesParser} from
+   * crawler commons
+   * 
+   * @param url
+   *          A string containing url
+   * @param content
+   *          Contents of the robots file in a byte array
+   * @param contentType
+   *          The content type of the robots file
+   * @param robotName
+   *          A string containing all the robots agent names used by parser for
+   *          matching
+   * @return BaseRobotRules object
+   */
+  public BaseRobotRules parseRules(String url, byte[] content,
+      String contentType, String robotName) {
+    return robotParser.parseContent(url, content, contentType, robotName);
+  }
+
+  public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {
+    URL u = null;
+    try {
+      u = new URL(url.toString());
+    } catch (Exception e) {
+      return EMPTY_RULES;
+    }
+    return getRobotRulesSet(protocol, u);
+  }
+
+  /**
+   * Fetch robots.txt (or it's protocol-specific equivalent) which applies to
+   * the given URL, parse it and return the set of robot rules applicable for
+   * the configured agent name(s).
+   * 
+   * @param protocol
+   *          protocol implementation
+   * @param url
+   *          URL to be checked whether fetching is allowed by robot rules
+   * @return robot rules
+   */
+  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
+
+  @Override
+  public int run(String[] args) {
+
+    if (args.length < 2) {
+      String[] help = {
+          "Usage: RobotRulesParser <robots-file> <url-file> [<agent-names>]\n",
+          "\tThe <robots-file> will be parsed as a robots.txt file,",
+          "\tusing the given <agent-name> to select rules.",
+          "\tURLs will be read (one per line) from <url-file>,",
+          "\tand tested against the rules.",
+          "\tMultiple agent names can be provided using",
+          "\tcomma as a delimiter without any spaces.",
+          "\tIf no agent name is given the property http.agent.name",
+          "\tis used. If http.agent.name is empty, robots.txt is checked",
+          "\tfor rules assigned to the user agent `*' (meaning any other)." };
+      for (String s : help) {
+        System.err.println(s);
+      }
+      System.exit(-1);
+    }
+
+    File robotsFile = new File(args[0]);
+    File urlFile = new File(args[1]);
+
+    if (args.length > 2) {
+      // set agent name from command-line in configuration and update parser
+      String agents = args[2];
+      conf.set("http.agent.name", agents);
+      setConf(conf);
+    }
+
+    try {
+      BaseRobotRules rules = getRobotRulesSet(null, robotsFile.toURI().toURL());
+
+      LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
+      String testPath;
+      testPath = testsIn.readLine().trim();
+      while (testPath != null) {
+        try {
+          // testPath can be just a path or a complete URL
+          URL url = new URL(testPath);
+          String status;
+          if (isWhiteListed(url)) {
+            status = "whitelisted";
+          } else if (rules.isAllowed(testPath)) {
+            status = "allowed";
+          } else {
+            status = "not allowed";
+          }
+          System.out.println(status + ":\t" + testPath);
+        } catch (MalformedURLException e) {
+        }
+        testPath = testsIn.readLine();
+      }
+      testsIn.close();
+    } catch (IOException e) {
+      LOG.error("Failed to run: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+
+    return 0;
+  }
+
+  /**
+   * {@link RobotRulesParser} implementation which expects the location of the
+   * robots.txt passed by URL (usually pointing to a local file) in
+   * {@link getRobotRulesSet}.
+   */
+  private static class TestRobotRulesParser extends RobotRulesParser {
+
+    public TestRobotRulesParser(Configuration conf) {
+      // make sure that agent name is set so that setConf() does not complain,
+      // the agent name is later overwritten by command-line argument
+      if (conf.get("http.agent.name") == null) {
+        conf.set("http.agent.name", "*");
+      }
+      setConf(conf);
+    }
+
+    /**
+     * @param protocol  (ignored)
+     * @param url
+     *          location of the robots.txt file
+     * */
+    public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url) {
+      BaseRobotRules rules;
+      try {
+        int contentLength = url.openConnection().getContentLength();
+        byte[] robotsBytes = new byte[contentLength];
+        InputStream openStream = url.openStream();
+        openStream.read(robotsBytes);
+        openStream.close();
+        rules = robotParser.parseContent(url.toString(), robotsBytes,
+            "text/plain", this.conf.get("http.agent.name"));
+      } catch (IOException e) {
+        LOG.error("Failed to open robots.txt file " + url
+            + StringUtils.stringifyException(e));
+        rules = EMPTY_RULES;
+      }
+      return rules;
+    }
+
+  }
+
+  public static void main(String[] args) throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    int res = ToolRunner.run(conf, new TestRobotRulesParser(conf), args);
+    System.exit(res);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/protocol/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/protocol/package-info.java b/nutch-core/src/main/java/org/apache/nutch/protocol/package-info.java
new file mode 100644
index 0000000..6685249
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/protocol/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Classes related to the {@link org.apache.nutch.protocol.Protocol Protocol} interface,
+ * see also {@link org.apache.nutch.net.protocols}.
+ */
+package org.apache.nutch.protocol;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/AbstractScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/AbstractScoringFilter.java b/nutch-core/src/main/java/org/apache/nutch/scoring/AbstractScoringFilter.java
new file mode 100644
index 0000000..d74c7fb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/AbstractScoringFilter.java
@@ -0,0 +1,68 @@
+package org.apache.nutch.scoring;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+public abstract class AbstractScoringFilter implements ScoringFilter {
+
+  private Configuration conf;
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return initSort;
+  }
+
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+  }
+
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+  }
+
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+  }
+
+  @Override
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return initScore;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilter.java b/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilter.java
new file mode 100644
index 0000000..4061a75
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilter.java
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.plugin.Pluggable;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * A contract defining behavior of scoring plugins.
+ * 
+ * A scoring filter will manipulate scoring variables in CrawlDatum and in
+ * resulting search indexes. Filters can be chained in a specific order, to
+ * provide multi-stage scoring adjustments.
+ * 
+ * @author Andrzej Bialecki
+ */
+public interface ScoringFilter extends Configurable, Pluggable {
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = ScoringFilter.class.getName();
+
+  /**
+   * Set an initial score for newly injected pages. Note: newly injected pages
+   * may have no inlinks, so filter implementations may wish to set this score
+   * to a non-zero value, to give newly injected pages some initial credit.
+   * 
+   * @param url
+   *          url of the page
+   * @param datum
+   *          new datum. Filters will modify it in-place.
+   * @throws ScoringFilterException
+   */
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException;
+
+  /**
+   * Set an initial score for newly discovered pages. Note: newly discovered
+   * pages have at least one inlink with its score contribution, so filter
+   * implementations may choose to set initial score to zero (unknown value),
+   * and then the inlink score contribution will set the "real" value of the new
+   * page.
+   * 
+   * @param url
+   *          url of the page
+   * @param datum
+   *          new datum. Filters will modify it in-place.
+   * @throws ScoringFilterException
+   */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException;
+
+  /**
+   * This method prepares a sort value for the purpose of sorting and selecting
+   * top N scoring pages during fetchlist generation.
+   * 
+   * @param url
+   *          url of the page
+   * @param datum
+   *          page's datum, should not be modified
+   * @param initSort
+   *          initial sort value, or a value from previous filters in chain
+   */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException;
+
+  /**
+   * This method takes all relevant score information from the current datum
+   * (coming from a generated fetchlist) and stores it into
+   * {@link org.apache.nutch.protocol.Content} metadata. This is needed in order
+   * to pass this value(s) to the mechanism that distributes it to outlinked
+   * pages.
+   * 
+   * @param url
+   *          url of the page
+   * @param datum
+   *          source datum. NOTE: modifications to this value are not persisted.
+   * @param content
+   *          instance of content. Implementations may modify this in-place,
+   *          primarily by setting some metadata properties.
+   */
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException;
+
+  /**
+   * Currently a part of score distribution is performed using only data coming
+   * from the parsing process. We need this method in order to ensure the
+   * presence of score data in these steps.
+   * 
+   * @param url
+   *          page url
+   * @param content
+   *          original content. NOTE: modifications to this value are not
+   *          persisted.
+   * @param parse
+   *          target instance to copy the score information to. Implementations
+   *          may modify this in-place, primarily by setting some metadata
+   *          properties.
+   */
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException;
+
+  /**
+   * Distribute score value from the current page to all its outlinked pages.
+   * 
+   * @param fromUrl
+   *          url of the source page
+   * @param parseData
+   *          ParseData instance, which stores relevant score value(s) in its
+   *          metadata. NOTE: filters may modify this in-place, all changes will
+   *          be persisted.
+   * @param targets
+   *          &lt;url, CrawlDatum&gt; pairs. NOTE: filters can modify this
+   *          in-place, all changes will be persisted.
+   * @param adjust
+   *          a CrawlDatum instance, initially null, which implementations may
+   *          use to pass adjustment values to the original CrawlDatum. When
+   *          creating this instance, set its status to
+   *          {@link CrawlDatum#STATUS_LINKED}.
+   * @param allCount
+   *          number of all collected outlinks from the source page
+   * @return if needed, implementations may return an instance of CrawlDatum,
+   *         with status {@link CrawlDatum#STATUS_LINKED}, which contains
+   *         adjustments to be applied to the original CrawlDatum score(s) and
+   *         metadata. This can be null if not needed.
+   * @throws ScoringFilterException
+   */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException;
+
+  /**
+   * This method calculates a new score of CrawlDatum during CrawlDb update,
+   * based on the initial value of the original CrawlDatum, and also score
+   * values contributed by inlinked pages.
+   * 
+   * @param url
+   *          url of the page
+   * @param old
+   *          original datum, with original score. May be null if this is a
+   *          newly discovered page. If not null, filters should use score
+   *          values from this parameter as the starting values - the
+   *          <code>datum</code> parameter may contain values that are no longer
+   *          valid, if other updates occurred between generation and this
+   *          update.
+   * @param datum
+   *          the new datum, with the original score saved at the time when
+   *          fetchlist was generated. Filters should update this in-place, and
+   *          it will be saved in the crawldb.
+   * @param inlinked
+   *          (partial) list of CrawlDatum-s (with their scores) from links
+   *          pointing to this page, found in the current update batch.
+   * @throws ScoringFilterException
+   */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException;
+
+  /**
+   * This method calculates a Lucene document boost.
+   * 
+   * @param url
+   *          url of the page
+   * @param doc
+   *          Lucene document. NOTE: this already contains all information
+   *          collected by indexing filters. Implementations may modify this
+   *          instance, in order to store/remove some information.
+   * @param dbDatum
+   *          current page from CrawlDb. NOTE: changes made to this instance are
+   *          not persisted.
+   * @param fetchDatum
+   *          datum from FetcherOutput (containing among others the fetching
+   *          status)
+   * @param parse
+   *          parsing result. NOTE: changes made to this instance are not
+   *          persisted.
+   * @param inlinks
+   *          current inlinks from LinkDb. NOTE: changes made to this instance
+   *          are not persisted.
+   * @param initScore
+   *          initial boost value for the Lucene document.
+   * @return boost value for the Lucene document. This value is passed as an
+   *         argument to the next scoring filter in chain. NOTE: implementations
+   *         may also express other scoring strategies by modifying Lucene
+   *         document directly.
+   * @throws ScoringFilterException
+   */
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException;
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilterException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilterException.java b/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilterException.java
new file mode 100644
index 0000000..f363c4b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilterException.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring;
+
+/**
+ * Specialized exception for errors during scoring.
+ * 
+ * @author Andrzej Bialecki
+ */
+@SuppressWarnings("serial")
+public class ScoringFilterException extends Exception {
+
+  public ScoringFilterException() {
+    super();
+  }
+
+  public ScoringFilterException(String message) {
+    super(message);
+  }
+
+  public ScoringFilterException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  public ScoringFilterException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilters.java b/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilters.java
new file mode 100644
index 0000000..5bad78f
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/ScoringFilters.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Creates and caches {@link ScoringFilter} implementing plugins.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class ScoringFilters extends Configured implements ScoringFilter {
+
+  private ScoringFilter[] filters;
+
+  public ScoringFilters(Configuration conf) {
+    super(conf);
+    this.filters = (ScoringFilter[]) PluginRepository.get(conf)
+        .getOrderedPlugins(ScoringFilter.class, ScoringFilter.X_POINT_ID,
+            "scoring.filter.order");
+  }
+
+  /** Calculate a sort value for Generate. */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      initSort = this.filters[i].generatorSortValue(url, datum, initSort);
+    }
+    return initSort;
+  }
+
+  /** Calculate a new initial score, used when adding newly discovered pages. */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      this.filters[i].initialScore(url, datum);
+    }
+  }
+
+  /** Calculate a new initial score, used when injecting new pages. */
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      this.filters[i].injectedScore(url, datum);
+    }
+  }
+
+  /** Calculate updated page score during CrawlDb.update(). */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      this.filters[i].updateDbScore(url, old, datum, inlinked);
+    }
+  }
+
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      this.filters[i].passScoreBeforeParsing(url, datum, content);
+    }
+  }
+
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      this.filters[i].passScoreAfterParsing(url, content, parse);
+    }
+  }
+
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      adjust = this.filters[i].distributeScoreToOutlinks(fromUrl, parseData,
+          targets, adjust, allCount);
+    }
+    return adjust;
+  }
+
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      initScore = this.filters[i].indexerScore(url, doc, dbDatum, fetchDatum,
+          parse, inlinks, initScore);
+    }
+    return initScore;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/package-info.java b/nutch-core/src/main/java/org/apache/nutch/scoring/package-info.java
new file mode 100644
index 0000000..b6a578b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The {@link org.apache.nutch.scoring.ScoringFilter ScoringFilter} interface.
+ */
+package org.apache.nutch.scoring;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkDatum.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkDatum.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkDatum.java
new file mode 100644
index 0000000..67c9366
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkDatum.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.webgraph;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A class for holding link information including the url, anchor text, a score,
+ * the timestamp of the link and a link type.
+ */
+public class LinkDatum implements Writable {
+
+  public final static byte INLINK = 1;
+  public final static byte OUTLINK = 2;
+
+  private String url = null;
+  private String anchor = "";
+  private float score = 0.0f;
+  private long timestamp = 0L;
+  private byte linkType = 0;
+
+  /**
+   * Default constructor, no url, timestamp, score, or link type.
+   */
+  public LinkDatum() {
+
+  }
+
+  /**
+   * Creates a LinkDatum with a given url. Timestamp is set to current time.
+   * 
+   * @param url
+   *          The link url.
+   */
+  public LinkDatum(String url) {
+    this(url, "", System.currentTimeMillis());
+  }
+
+  /**
+   * Creates a LinkDatum with a url and an anchor text. Timestamp is set to
+   * current time.
+   * 
+   * @param url
+   *          The link url.
+   * @param anchor
+   *          The link anchor text.
+   */
+  public LinkDatum(String url, String anchor) {
+    this(url, anchor, System.currentTimeMillis());
+  }
+
+  public LinkDatum(String url, String anchor, long timestamp) {
+    this.url = url;
+    this.anchor = anchor;
+    this.timestamp = timestamp;
+  }
+
+  public String getUrl() {
+    return url;
+  }
+
+  public String getAnchor() {
+    return anchor;
+  }
+
+  public void setAnchor(String anchor) {
+    this.anchor = anchor;
+  }
+
+  public float getScore() {
+    return score;
+  }
+
+  public void setScore(float score) {
+    this.score = score;
+  }
+
+  public void setUrl(String url) {
+    this.url = url;
+  }
+
+  public long getTimestamp() {
+    return timestamp;
+  }
+
+  public void setTimestamp(long timestamp) {
+    this.timestamp = timestamp;
+  }
+
+  public byte getLinkType() {
+    return linkType;
+  }
+
+  public void setLinkType(byte linkType) {
+    this.linkType = linkType;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    url = Text.readString(in);
+    anchor = Text.readString(in);
+    score = in.readFloat();
+    timestamp = in.readLong();
+    linkType = in.readByte();
+  }
+
+  public void write(DataOutput out) throws IOException {
+    Text.writeString(out, url);
+    Text.writeString(out, anchor != null ? anchor : "");
+    out.writeFloat(score);
+    out.writeLong(timestamp);
+    out.writeByte(linkType);
+  }
+
+  public String toString() {
+
+    String type = (linkType == INLINK ? "inlink"
+        : (linkType == OUTLINK) ? "outlink" : "unknown");
+    return "url: " + url + ", anchor: " + anchor + ", score: " + score
+        + ", timestamp: " + timestamp + ", link type: " + type;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
new file mode 100644
index 0000000..1569c4d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
@@ -0,0 +1,433 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.webgraph;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+
+/**
+ * The LinkDumper tool creates a database of node to inlink information that can
+ * be read using the nested Reader class. This allows the inlink and scoring
+ * state of a single url to be reviewed quickly to determine why a given url is
+ * ranking a certain way. This tool is to be used with the LinkRank analysis.
+ */
+public class LinkDumper extends Configured implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(LinkDumper.class);
+  public static final String DUMP_DIR = "linkdump";
+
+  /**
+   * Reader class which will print out the url and all of its inlinks to system
+   * out. Each inlink will be displayed with its node information including score
+   * and number of in and outlinks.
+   */
+  public static class Reader {
+
+    public static void main(String[] args) throws Exception {
+
+      if (args == null || args.length < 2) {
+        System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
+        return;
+      }
+
+      // open the readers for the linkdump directory
+      Configuration conf = NutchConfiguration.create();
+      FileSystem fs = FileSystem.get(conf);
+      Path webGraphDb = new Path(args[0]);
+      String url = args[1];
+      MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
+          webGraphDb, DUMP_DIR), conf);
+
+      // get the link nodes for the url
+      Text key = new Text(url);
+      LinkNodes nodes = new LinkNodes();
+      MapFileOutputFormat.getEntry(readers,
+          new HashPartitioner<Text, LinkNodes>(), key, nodes);
+
+      // print out the link nodes
+      LinkNode[] linkNodesAr = nodes.getLinks();
+      System.out.println(url + ":");
+      for (LinkNode node : linkNodesAr) {
+        System.out.println("  " + node.getUrl() + " - "
+            + node.getNode().toString());
+      }
+
+      // close the readers
+      FSUtils.closeReaders(readers);
+    }
+  }
+
+  /**
+   * Bean class which holds url to node information.
+   */
+  public static class LinkNode implements Writable {
+
+    private String url = null;
+    private Node node = null;
+
+    public LinkNode() {
+
+    }
+
+    public LinkNode(String url, Node node) {
+      this.url = url;
+      this.node = node;
+    }
+
+    public String getUrl() {
+      return url;
+    }
+
+    public void setUrl(String url) {
+      this.url = url;
+    }
+
+    public Node getNode() {
+      return node;
+    }
+
+    public void setNode(Node node) {
+      this.node = node;
+    }
+
+    public void readFields(DataInput in) throws IOException {
+      url = in.readUTF();
+      node = new Node();
+      node.readFields(in);
+    }
+
+    public void write(DataOutput out) throws IOException {
+      out.writeUTF(url);
+      node.write(out);
+    }
+
+  }
+
+  /**
+   * Writable class which holds an array of LinkNode objects.
+   */
+  public static class LinkNodes implements Writable {
+
+    private LinkNode[] links;
+
+    public LinkNodes() {
+
+    }
+
+    public LinkNodes(LinkNode[] links) {
+      this.links = links;
+    }
+
+    public LinkNode[] getLinks() {
+      return links;
+    }
+
+    public void setLinks(LinkNode[] links) {
+      this.links = links;
+    }
+
+    public void readFields(DataInput in) throws IOException {
+      int numLinks = in.readInt();
+      if (numLinks > 0) {
+        links = new LinkNode[numLinks];
+        for (int i = 0; i < numLinks; i++) {
+          LinkNode node = new LinkNode();
+          node.readFields(in);
+          links[i] = node;
+        }
+      }
+    }
+
+    public void write(DataOutput out) throws IOException {
+      if (links != null && links.length > 0) {
+        int numLinks = links.length;
+        out.writeInt(numLinks);
+        for (int i = 0; i < numLinks; i++) {
+          links[i].write(out);
+        }
+      }
+    }
+  }
+
+  /**
+   * Inverts outlinks from the WebGraph to inlinks and attaches node
+   * information.
+   */
+  public static class Inverter implements
+      Mapper<Text, Writable, Text, ObjectWritable>,
+      Reducer<Text, ObjectWritable, Text, LinkNode> {
+
+    private JobConf conf;
+
+    public void configure(JobConf conf) {
+      this.conf = conf;
+    }
+
+    /**
+     * Wraps all values in ObjectWritables.
+     */
+    public void map(Text key, Writable value,
+        OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+        throws IOException {
+
+      ObjectWritable objWrite = new ObjectWritable();
+      objWrite.set(value);
+      output.collect(key, objWrite);
+    }
+
+    /**
+     * Inverts outlinks to inlinks while attaching node information to the
+     * outlink.
+     */
+    public void reduce(Text key, Iterator<ObjectWritable> values,
+        OutputCollector<Text, LinkNode> output, Reporter reporter)
+        throws IOException {
+
+      String fromUrl = key.toString();
+      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
+      Node node = null;
+      
+      // loop through all values aggregating outlinks, saving node
+      while (values.hasNext()) {
+        ObjectWritable write = values.next();
+        Object obj = write.get();
+        if (obj instanceof Node) {
+          node = (Node) obj;
+        } else if (obj instanceof LinkDatum) {
+          outlinks.add(WritableUtils.clone((LinkDatum) obj, conf));
+        }
+      }
+
+      // only collect if there are outlinks
+      int numOutlinks = node.getNumOutlinks();
+      if (numOutlinks > 0) {
+        for (int i = 0; i < outlinks.size(); i++) {
+          LinkDatum outlink = outlinks.get(i);
+          String toUrl = outlink.getUrl();
+
+          // collect the outlink as an inlink with the node
+          output.collect(new Text(toUrl), new LinkNode(fromUrl, node));
+        }
+      }
+    }
+
+    public void close() {
+    }
+  }
+
+  /**
+   * Merges LinkNode objects into a single array value per url. This allows all
+   * values to be quickly retrieved and printed via the Reader tool.
+   */
+  public static class Merger implements
+      Reducer<Text, LinkNode, Text, LinkNodes> {
+
+    private JobConf conf;
+    private int maxInlinks = 50000;
+
+    public void configure(JobConf conf) {
+      this.conf = conf;
+    }
+
+    /**
+     * Aggregate all LinkNode objects for a given url.
+     */
+    public void reduce(Text key, Iterator<LinkNode> values,
+        OutputCollector<Text, LinkNodes> output, Reporter reporter)
+        throws IOException {
+
+      List<LinkNode> nodeList = new ArrayList<LinkNode>();
+      int numNodes = 0;
+
+      while (values.hasNext()) {
+        LinkNode cur = values.next();
+        if (numNodes < maxInlinks) {
+          nodeList.add(WritableUtils.clone(cur, conf));
+          numNodes++;
+        } else {
+          break;
+        }
+      }
+
+      LinkNode[] linkNodesAr = nodeList.toArray(new LinkNode[nodeList.size()]);
+      LinkNodes linkNodes = new LinkNodes(linkNodesAr);
+      output.collect(key, linkNodes);
+    }
+
+    public void close() {
+
+    }
+  }
+
+  /**
+   * Runs the inverter and merger jobs of the LinkDumper tool to create the url
+   * to inlink node database.
+   */
+  public void dumpLinks(Path webGraphDb) throws IOException {
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("NodeDumper: starting at " + sdf.format(start));
+    Configuration conf = getConf();
+    FileSystem fs = FileSystem.get(conf);
+
+    Path linkdump = new Path(webGraphDb, DUMP_DIR);
+    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
+    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
+
+    // run the inverter job
+    Path tempInverted = new Path(webGraphDb, "inverted-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+    JobConf inverter = new NutchJob(conf);
+    inverter.setJobName("LinkDumper: inverter");
+    FileInputFormat.addInputPath(inverter, nodeDb);
+    FileInputFormat.addInputPath(inverter, outlinkDb);
+    inverter.setInputFormat(SequenceFileInputFormat.class);
+    inverter.setMapperClass(Inverter.class);
+    inverter.setReducerClass(Inverter.class);
+    inverter.setMapOutputKeyClass(Text.class);
+    inverter.setMapOutputValueClass(ObjectWritable.class);
+    inverter.setOutputKeyClass(Text.class);
+    inverter.setOutputValueClass(LinkNode.class);
+    FileOutputFormat.setOutputPath(inverter, tempInverted);
+    inverter.setOutputFormat(SequenceFileOutputFormat.class);
+
+    try {
+      LOG.info("LinkDumper: running inverter");
+      JobClient.runJob(inverter);
+      LOG.info("LinkDumper: finished inverter");
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+      throw e;
+    }
+
+    // run the merger job
+    JobConf merger = new NutchJob(conf);
+    merger.setJobName("LinkDumper: merger");
+    FileInputFormat.addInputPath(merger, tempInverted);
+    merger.setInputFormat(SequenceFileInputFormat.class);
+    merger.setReducerClass(Merger.class);
+    merger.setMapOutputKeyClass(Text.class);
+    merger.setMapOutputValueClass(LinkNode.class);
+    merger.setOutputKeyClass(Text.class);
+    merger.setOutputValueClass(LinkNodes.class);
+    FileOutputFormat.setOutputPath(merger, linkdump);
+    merger.setOutputFormat(MapFileOutputFormat.class);
+
+    try {
+      LOG.info("LinkDumper: running merger");
+      JobClient.runJob(merger);
+      LOG.info("LinkDumper: finished merger");
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+      throw e;
+    }
+
+    fs.delete(tempInverted, true);
+    long end = System.currentTimeMillis();
+    LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
+  }
+
  /**
   * Command-line entry point; dispatches to {@link #run(String[])} via the
   * Hadoop ToolRunner and exits with the tool's return code.
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDumper(),
        args);
    System.exit(res);
  }
+
  /**
   * Runs the LinkDumper tool. This simply creates the database, to read the
   * values the nested Reader tool must be used.
   *
   * @param args command line arguments; requires -webgraphdb, supports -help
   * @return 0 on success, -1 on bad usage or help, -2 on error
   */
  public int run(String[] args) throws Exception {

    Options options = new Options();
    // OptionBuilder is stateful: each create() consumes the withArgName /
    // withDescription calls issued since the previous create()
    OptionBuilder.withArgName("help");
    OptionBuilder.withDescription("show this help message");
    Option helpOpts = OptionBuilder.create("help");
    options.addOption(helpOpts);

    OptionBuilder.withArgName("webgraphdb");
    OptionBuilder.hasArg();
    OptionBuilder.withDescription("the web graph database to use");
    Option webGraphDbOpts = OptionBuilder.create("webgraphdb");
    options.addOption(webGraphDbOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      // -webgraphdb is mandatory; print usage and bail out otherwise
      if (line.hasOption("help") || !line.hasOption("webgraphdb")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("LinkDumper", options);
        return -1;
      }

      String webGraphDb = line.getOptionValue("webgraphdb");
      dumpLinks(new Path(webGraphDb));
      return 0;
    } catch (Exception e) {
      // log and signal failure rather than propagating to ToolRunner
      LOG.error("LinkDumper: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkRank.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkRank.java
new file mode 100644
index 0000000..bd22828
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/scoring/webgraph/LinkRank.java
@@ -0,0 +1,677 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.webgraph;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.FSUtils;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.TimingUtil;
+import org.apache.nutch.util.URLUtil;
+
+public class LinkRank extends Configured implements Tool {
+
+  public static final Logger LOG = LoggerFactory.getLogger(LinkRank.class);
+  private static final String NUM_NODES = "_num_nodes_";
+
+  /**
+   * Runs the counter job. The counter job determines the number of links in the
+   * webgraph. This is used during analysis.
+   * 
+   * @param fs
+   *          The job file system.
+   * @param webGraphDb
+   *          The web graph database to use.
+   * 
+   * @return The number of nodes in the web graph.
+   * @throws IOException
+   *           If an error occurs while running the counter job.
+   */
+  private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {
+
+    // configure the counter job
+    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
+    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
+    JobConf counter = new NutchJob(getConf());
+    counter.setJobName("LinkRank Counter");
+    FileInputFormat.addInputPath(counter, nodeDb);
+    FileOutputFormat.setOutputPath(counter, numLinksPath);
+    counter.setInputFormat(SequenceFileInputFormat.class);
+    counter.setMapperClass(Counter.class);
+    counter.setCombinerClass(Counter.class);
+    counter.setReducerClass(Counter.class);
+    counter.setMapOutputKeyClass(Text.class);
+    counter.setMapOutputValueClass(LongWritable.class);
+    counter.setOutputKeyClass(Text.class);
+    counter.setOutputValueClass(LongWritable.class);
+    counter.setNumReduceTasks(1);
+    counter.setOutputFormat(TextOutputFormat.class);
+    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
+        false);
+
+    // run the counter job, outputs to a single reduce task and file
+    LOG.info("Starting link counter job");
+    try {
+      JobClient.runJob(counter);
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+      throw e;
+    }
+    LOG.info("Finished link counter job");
+
+    // read the first (and only) line from the file which should be the
+    // number of links in the web graph
+    LOG.info("Reading numlinks temp file");
+    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
+    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
+    String numLinksLine = buffer.readLine();
+    readLinks.close();
+
+    // check if there are links to process, if none, webgraph might be empty
+    if (numLinksLine == null || numLinksLine.length() == 0) {
+      fs.delete(numLinksPath, true);
+      throw new IOException("No links to process, is the webgraph empty?");
+    }
+
+    // delete temp file and convert and return the number of links as an int
+    LOG.info("Deleting numlinks temp file");
+    fs.delete(numLinksPath, true);
+    String numLinks = numLinksLine.split("\\s+")[1];
+    return Integer.parseInt(numLinks);
+  }
+
  /**
   * Runs the initializer job. The initializer job sets up the nodes with a
   * default starting score for link analysis.
   * 
   * @param nodeDb
   *          The node database to use.
   * @param output
   *          The job output directory.
   * 
   * @throws IOException
   *           If an error occurs while running the initializer job.
   */
  private void runInitializer(Path nodeDb, Path output) throws IOException {

    // configure the initializer; no reducer class is set explicitly
    JobConf initializer = new NutchJob(getConf());
    initializer.setJobName("LinkAnalysis Initializer");
    FileInputFormat.addInputPath(initializer, nodeDb);
    FileOutputFormat.setOutputPath(initializer, output);
    initializer.setInputFormat(SequenceFileInputFormat.class);
    initializer.setMapperClass(Initializer.class);
    initializer.setMapOutputKeyClass(Text.class);
    initializer.setMapOutputValueClass(Node.class);
    initializer.setOutputKeyClass(Text.class);
    initializer.setOutputValueClass(Node.class);
    initializer.setOutputFormat(MapFileOutputFormat.class);
    // NOTE(review): presumably suppresses _SUCCESS marker files so the output
    // directory can be read back as a MapFile -- confirm against readers
    initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
        false);

    // run the initializer
    LOG.info("Starting initialization job");
    try {
      JobClient.runJob(initializer);
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished initialization job.");
  }
+
  /**
   * Runs the inverter job. The inverter job flips outlinks to inlinks to be
   * passed into the analysis job.
   * 
   * @param nodeDb
   *          The node database to use.
   * @param outlinkDb
   *          The outlink database to use.
   * @param output
   *          The output directory.
   * 
   * @throws IOException
   *           If an error occurs while running the inverter job.
   */
  private void runInverter(Path nodeDb, Path outlinkDb, Path output)
      throws IOException {

    // configure the inverter; reads both the node and outlink databases so
    // the Inverter reducer sees Node and LinkDatum records per url
    JobConf inverter = new NutchJob(getConf());
    inverter.setJobName("LinkAnalysis Inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    FileInputFormat.addInputPath(inverter, outlinkDb);
    FileOutputFormat.setOutputPath(inverter, output);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkDatum.class);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);
    inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
        false);

    // run the inverter job
    LOG.info("Starting inverter job");
    try {
      JobClient.runJob(inverter);
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished inverter job.");
  }
+
  /**
   * Runs the link analysis job. The link analysis job applies the link rank
   * formula to create a score per url and stores that score in the NodeDb.
   * 
   * Typically the link analysis job is run a number of times to allow the link
   * rank scores to converge.
   * 
   * @param nodeDb
   *          The node database from which we are getting previous link rank
   *          scores.
   * @param inverted
   *          The inverted inlinks
   * @param output
   *          The link analysis output.
   * @param iteration
   *          The current iteration number.
   * @param numIterations
   *          The total number of link analysis iterations
   * @param rankOne
   *          The score assigned to pages with zero inlinks.
   * 
   * @throws IOException
   *           If an error occurs during link analysis.
   */
  private void runAnalysis(Path nodeDb, Path inverted, Path output,
      int iteration, int numIterations, float rankOne) throws IOException {

    JobConf analyzer = new NutchJob(getConf());
    // pass iteration and rank-one values to the Analyzer via the job conf
    analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
    analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1)
        + " of " + numIterations);
    FileInputFormat.addInputPath(analyzer, nodeDb);
    FileInputFormat.addInputPath(analyzer, inverted);
    FileOutputFormat.setOutputPath(analyzer, output);
    analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
    analyzer.setMapOutputKeyClass(Text.class);
    analyzer.setMapOutputValueClass(ObjectWritable.class);
    analyzer.setInputFormat(SequenceFileInputFormat.class);
    analyzer.setMapperClass(Analyzer.class);
    analyzer.setReducerClass(Analyzer.class);
    analyzer.setOutputKeyClass(Text.class);
    analyzer.setOutputValueClass(Node.class);
    analyzer.setOutputFormat(MapFileOutputFormat.class);
    analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
        false);

    LOG.info("Starting analysis job");
    try {
      JobClient.runJob(analyzer);
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw e;
    }
    LOG.info("Finished analysis job.");
  }
+
+  /**
+   * The Counter job that determines the total number of nodes in the WebGraph.
+   * This is used to determine a rank one score for pages with zero inlinks but
+   * that contain outlinks.
+   */
+  private static class Counter implements
+      Mapper<Text, Node, Text, LongWritable>,
+      Reducer<Text, LongWritable, Text, LongWritable> {
+
+    private static Text numNodes = new Text(NUM_NODES);
+    private static LongWritable one = new LongWritable(1L);
+
+    public void configure(JobConf conf) {
+    }
+
+    /**
+     * Outputs one for every node.
+     */
+    public void map(Text key, Node value,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+      output.collect(numNodes, one);
+    }
+
+    /**
+     * Totals the node number and outputs a single total value.
+     */
+    public void reduce(Text key, Iterator<LongWritable> values,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+
+      long total = 0;
+      while (values.hasNext()) {
+        total += values.next().get();
+      }
+      output.collect(numNodes, new LongWritable(total));
+    }
+
+    public void close() {
+    }
+  }
+
+  private static class Initializer implements Mapper<Text, Node, Text, Node> {
+
+    private JobConf conf;
+    private float initialScore = 1.0f;
+
+    public void configure(JobConf conf) {
+      this.conf = conf;
+      initialScore = conf.getFloat("link.analyze.initial.score", 1.0f);
+    }
+
+    public void map(Text key, Node node, OutputCollector<Text, Node> output,
+        Reporter reporter) throws IOException {
+
+      String url = key.toString();
+      Node outNode = WritableUtils.clone(node, conf);
+      outNode.setInlinkScore(initialScore);
+
+      output.collect(new Text(url), outNode);
+    }
+
+    public void close() {
+    }
+  }
+
+  /**
+   * Inverts outlinks and attaches current score from the NodeDb of the
+   * WebGraph. The link analysis process consists of inverting, analyzing and
+   * scoring, in a loop for a given number of iterations.
+   */
+  private static class Inverter implements
+      Mapper<Text, Writable, Text, ObjectWritable>,
+      Reducer<Text, ObjectWritable, Text, LinkDatum> {
+
+    private JobConf conf;
+
+    public void configure(JobConf conf) {
+      this.conf = conf;
+    }
+
+    /**
+     * Convert values to ObjectWritable
+     */
+    public void map(Text key, Writable value,
+        OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+        throws IOException {
+
+      ObjectWritable objWrite = new ObjectWritable();
+      objWrite.set(value);
+      output.collect(key, objWrite);
+    }
+
+    /**
+     * Inverts outlinks to inlinks, attaches current score for the outlink from
+     * the NodeDb of the WebGraph.
+     */
+    public void reduce(Text key, Iterator<ObjectWritable> values,
+        OutputCollector<Text, LinkDatum> output, Reporter reporter)
+        throws IOException {
+
+      String fromUrl = key.toString();
+      List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
+      Node node = null;
+
+      // aggregate outlinks, assign other values
+      while (values.hasNext()) {
+        ObjectWritable write = values.next();
+        Object obj = write.get();
+        if (obj instanceof Node) {
+          node = (Node) obj;
+        } else if (obj instanceof LinkDatum) {
+          outlinks.add(WritableUtils.clone((LinkDatum) obj, conf));
+        }
+      }
+
+      // get the number of outlinks and the current inlink and outlink scores
+      // from the node of the url
+      int numOutlinks = node.getNumOutlinks();
+      float inlinkScore = node.getInlinkScore();
+      float outlinkScore = node.getOutlinkScore();
+      LOG.debug(fromUrl + ": num outlinks " + numOutlinks);
+
+      // can't invert if no outlinks
+      if (numOutlinks > 0) {
+        for (int i = 0; i < outlinks.size(); i++) {
+          LinkDatum outlink = outlinks.get(i);
+          String toUrl = outlink.getUrl();
+
+          outlink.setUrl(fromUrl);
+          outlink.setScore(outlinkScore);
+
+          // collect the inverted outlink
+          output.collect(new Text(toUrl), outlink);
+          LOG.debug(toUrl + ": inverting inlink from " + fromUrl
+              + " origscore: " + inlinkScore + " numOutlinks: " + numOutlinks
+              + " inlinkscore: " + outlinkScore);
+        }
+      }
+    }
+
+    public void close() {
+    }
+  }
+
+  /**
+   * Runs a single link analysis iteration.
+   */
+  private static class Analyzer implements
+      Mapper<Text, Writable, Text, ObjectWritable>,
+      Reducer<Text, ObjectWritable, Text, Node> {
+
+    private JobConf conf;
+    private float dampingFactor = 0.85f;
+    private float rankOne = 0.0f;
+    private int itNum = 0;
+    private boolean limitPages = true;
+    private boolean limitDomains = true;
+
+    /**
+     * Configures the job, sets the damping factor, rank one score, and other
+     * needed values for analysis.
+     */
+    public void configure(JobConf conf) {
+
+      try {
+        this.conf = conf;
+        this.dampingFactor = conf
+            .getFloat("link.analyze.damping.factor", 0.85f);
+        this.rankOne = conf.getFloat("link.analyze.rank.one", 0.0f);
+        this.itNum = conf.getInt("link.analyze.iteration", 0);
+        limitPages = conf.getBoolean("link.ignore.limit.page", true);
+        limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
+      } catch (Exception e) {
+        LOG.error(StringUtils.stringifyException(e));
+        throw new IllegalArgumentException(e);
+      }
+    }
+
+    /**
+     * Convert values to ObjectWritable
+     */
+    public void map(Text key, Writable value,
+        OutputCollector<Text, ObjectWritable> output, Reporter reporter)
+        throws IOException {
+
+      ObjectWritable objWrite = new ObjectWritable();
+      objWrite.set(WritableUtils.clone(value, conf));
+      output.collect(key, objWrite);
+    }
+
+    /**
+     * Performs a single iteration of link analysis. The resulting scores are
+     * stored in a temporary NodeDb which replaces the NodeDb of the WebGraph.
+     */
+    public void reduce(Text key, Iterator<ObjectWritable> values,
+        OutputCollector<Text, Node> output, Reporter reporter)
+        throws IOException {
+
+      String url = key.toString();
+      Set<String> domains = new HashSet<String>();
+      Set<String> pages = new HashSet<String>();
+      Node node = null;
+
+      // a page with zero inlinks has a score of rankOne
+      int numInlinks = 0;
+      float totalInlinkScore = rankOne;
+
+      while (values.hasNext()) {
+
+        ObjectWritable next = values.next();
+        Object value = next.get();
+        if (value instanceof Node) {
+          node = (Node) value;
+        } else if (value instanceof LinkDatum) {
+
+          LinkDatum linkDatum = (LinkDatum) value;
+          float scoreFromInlink = linkDatum.getScore();
+          String inlinkUrl = linkDatum.getUrl();
+          String inLinkDomain = URLUtil.getDomainName(inlinkUrl);
+          String inLinkPage = URLUtil.getPage(inlinkUrl);
+
+          // limit counting duplicate inlinks by pages or domains
+          if ((limitPages && pages.contains(inLinkPage))
+              || (limitDomains && domains.contains(inLinkDomain))) {
+            LOG.debug(url + ": ignoring " + scoreFromInlink + " from "
+                + inlinkUrl + ", duplicate page or domain");
+            continue;
+          }
+
+          // aggregate total inlink score
+          numInlinks++;
+          totalInlinkScore += scoreFromInlink;
+          domains.add(inLinkDomain);
+          pages.add(inLinkPage);
+          LOG.debug(url + ": adding " + scoreFromInlink + " from " + inlinkUrl
+              + ", total: " + totalInlinkScore);
+        }
+      }
+
+      // calculate linkRank score formula
+      float linkRankScore = (1 - this.dampingFactor)
+          + (this.dampingFactor * totalInlinkScore);
+
+      LOG.debug(url + ": score: " + linkRankScore + " num inlinks: "
+          + numInlinks + " iteration: " + itNum);
+
+      // store the score in a temporary NodeDb
+      Node outNode = WritableUtils.clone(node, conf);
+      outNode.setInlinkScore(linkRankScore);
+      output.collect(key, outNode);
+    }
+
+    public void close() throws IOException {
+    }
+  }
+
  /**
   * Default constructor. The Configuration is expected to be injected later,
   * e.g. by ToolRunner via {@code setConf}.
   */
  public LinkRank() {
    super();
  }
+
  /**
   * Configurable constructor.
   *
   * @param conf the Hadoop configuration to run with
   */
  public LinkRank(Configuration conf) {
    super(conf);
  }
+
  /** No-op: this tool holds no resources that need explicit release. */
  public void close() {
  }
+
  /**
   * Runs the complete link analysis job. The complete job determines the rank
   * one score. Then runs through a given number of invert and analyze
   * iterations, by default 10. And finally replaces the NodeDb in the WebGraph
   * with the link rank output.
   * 
   * @param webGraphDb
   *          The WebGraph to run link analysis on.
   * 
   * @throws IOException
   *           If an error occurs during link analysis.
   */
  public void analyze(Path webGraphDb) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Analysis: starting at " + sdf.format(start));

    // store the link rank under the webgraphdb temporarily, final scores get
    // updated into the nodedb
    Path linkRank = new Path(webGraphDb, "linkrank");
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // create the linkrank directory if needed
    if (!fs.exists(linkRank)) {
      fs.mkdirs(linkRank);
    }

    // the webgraph outlink and node database paths
    Path wgOutlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
    Path wgNodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path nodeDb = new Path(linkRank, WebGraph.NODE_DIR);

    // get the number of total nodes in the webgraph, used for rank one, then
    // initialize all urls with a default score
    int numLinks = runCounter(fs, webGraphDb);
    runInitializer(wgNodeDb, nodeDb);
    float rankOneScore = (1f / (float) numLinks);

    if (LOG.isInfoEnabled()) {
      LOG.info("Analysis: Number of links: " + numLinks);
      LOG.info("Analysis: Rank One: " + rankOneScore);
    }

    // run invert and analysis for a given number of iterations to allow the
    // link rank scores to converge
    int numIterations = conf.getInt("link.analyze.num.iterations", 10);
    for (int i = 0; i < numIterations; i++) {

      // the input to inverting is always the previous output from analysis;
      // a randomly named temp dir avoids collisions between runs
      LOG.info("Analysis: Starting iteration " + (i + 1) + " of "
          + numIterations);
      Path tempRank = new Path(linkRank + "-"
          + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
      fs.mkdirs(tempRank);
      Path tempInverted = new Path(tempRank, "inverted");
      Path tempNodeDb = new Path(tempRank, WebGraph.NODE_DIR);

      // run invert and analysis
      runInverter(nodeDb, wgOutlinkDb, tempInverted);
      runAnalysis(nodeDb, tempInverted, tempNodeDb, i, numIterations,
          rankOneScore);

      // replace the temporary NodeDb with the output from analysis
      LOG.info("Analysis: Installing new link scores");
      FSUtils.replace(fs, linkRank, tempRank, true);
      LOG.info("Analysis: finished iteration " + (i + 1) + " of "
          + numIterations);
    }

    // replace the NodeDb in the WebGraph with the final output of analysis
    LOG.info("Analysis: Installing web graph nodes");
    FSUtils.replace(fs, wgNodeDb, nodeDb, true);

    // remove the temporary link rank folder
    fs.delete(linkRank, true);
    long end = System.currentTimeMillis();
    LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
  }
+
  /**
   * Command-line entry point; dispatches to {@link #run(String[])} via the
   * Hadoop ToolRunner and exits with the tool's return code.
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new LinkRank(), args);
    System.exit(res);
  }
+
  /**
   * Runs the LinkRank tool.
   *
   * @param args command line arguments; requires -webgraphdb, supports -help
   * @return 0 on success, -1 on bad usage or help, -2 on error
   */
  public int run(String[] args) throws Exception {

    Options options = new Options();
    // OptionBuilder is stateful: each create() consumes the withArgName /
    // withDescription calls issued since the previous create()
    OptionBuilder.withArgName("help");
    OptionBuilder.withDescription("show this help message");
    Option helpOpts = OptionBuilder.create("help");
    options.addOption(helpOpts);

    OptionBuilder.withArgName("webgraphdb");
    OptionBuilder.hasArg();
    OptionBuilder.withDescription("the web graph db to use");
    Option webgraphOpts = OptionBuilder.create("webgraphdb");
    options.addOption(webgraphOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      // -webgraphdb is mandatory; print usage and bail out otherwise
      if (line.hasOption("help") || !line.hasOption("webgraphdb")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("LinkRank", options);
        return -1;
      }

      String webGraphDb = line.getOptionValue("webgraphdb");

      analyze(new Path(webGraphDb));
      return 0;
    } catch (Exception e) {
      // log and signal failure rather than propagating to ToolRunner
      LOG.error("LinkAnalysis: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
+}


[14/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Posted by th...@apache.org.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/src/test/java/org/apache/nutch/parse/ext/TestExtParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/src/test/java/org/apache/nutch/parse/ext/TestExtParser.java b/nutch-plugins/parse-ext/src/test/java/org/apache/nutch/parse/ext/TestExtParser.java
new file mode 100644
index 0000000..a399273
--- /dev/null
+++ b/nutch-plugins/parse-ext/src/test/java/org/apache/nutch/parse/ext/TestExtParser.java
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.ext;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+/**
+ * Unit tests for ExtParser. First creates a temp file with fixed content, then
+ * fetch and parse it using external command 'cat' and 'md5sum' alternately for
+ * 10 times. Doing so also does a light stress test for class CommandRunner.java
+ * (as used in ExtParser.java).
+ * 
+ * Warning: currently only do test on linux platform.
+ * 
+ * @author John Xing
+ */
+public class TestExtParser {
+  private File tempFile = null;
+  private String urlString = null;
+  private Content content = null;
+  private Parse parse = null;
+
+  private String expectedText = "nutch rocks nutch rocks nutch rocks";
+  // echo -n "nutch rocks nutch rocks nutch rocks" | md5sum
+  private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526";
+
+  @Before
+  protected void setUp() throws ProtocolException, IOException {
+    // prepare a temp file with expectedText as its content
+    // This system property is defined in ./src/plugin/build-plugin.xml
+    String path = System.getProperty("test.data");
+    if (path != null) {
+      File tempDir = new File(path);
+      if (!tempDir.exists())
+        tempDir.mkdir();
+      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt",
+          tempDir);
+    } else {
+      // otherwise in java.io.tmpdir
+      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
+    }
+    urlString = tempFile.toURI().toURL().toString();
+
+    FileOutputStream fos = new FileOutputStream(tempFile);
+    fos.write(expectedText.getBytes());
+    fos.close();
+
+    // get nutch content
+    Protocol protocol = new ProtocolFactory(NutchConfiguration.create())
+        .getProtocol(urlString);
+    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
+        .getContent();
+    protocol = null;
+  }
+
+  @After
+  protected void tearDown() {
+    // clean content
+    content = null;
+
+    // clean temp file
+    // if (tempFile != null && tempFile.exists())
+    // tempFile.delete();
+  }
+
+  @Test
+  public void testIt() throws ParseException {
+    String contentType;
+
+    // now test only on linux platform
+    if (!System.getProperty("os.name").equalsIgnoreCase("linux")) {
+      System.err
+          .println("Current OS is " + System.getProperty("os.name") + ".");
+      System.err.println("No test is run on OS other than linux.");
+      return;
+    }
+
+    Configuration conf = NutchConfiguration.create();
+    // loop alternately, total 10*2 times of invoking external command
+    for (int i = 0; i < 10; i++) {
+      // check external parser that does 'cat'
+      contentType = "application/vnd.nutch.example.cat";
+      content.setContentType(contentType);
+      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
+          content.getUrl());
+      Assert.assertEquals(expectedText, parse.getText());
+
+      // check external parser that does 'md5sum'
+      contentType = "application/vnd.nutch.example.md5sum";
+      content.setContentType(contentType);
+      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
+          content.getUrl());
+      Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/build.xml b/nutch-plugins/parse-html/build.xml
new file mode 100755
index 0000000..a5b99b5
--- /dev/null
+++ b/nutch-plugins/parse-html/build.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-html" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-nekohtml/*.jar" />
+    </fileset>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/ivy.xml b/nutch-plugins/parse-html/ivy.xml
new file mode 100644
index 0000000..e8a6135
--- /dev/null
+++ b/nutch-plugins/parse-html/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+   <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/>
+  </dependencies>
+
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/plugin.xml b/nutch-plugins/parse-html/plugin.xml
new file mode 100755
index 0000000..3be70c3
--- /dev/null
+++ b/nutch-plugins/parse-html/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-html"
+   name="Html Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-html.jar">
+         <export name="*"/>
+      </library>
+      <library name="tagsoup-1.2.1.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-nekohtml"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.html"
+              name="HtmlParse"
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.html.HtmlParser"
+                      class="org.apache.nutch.parse.html.HtmlParser">
+        <parameter name="contentType" value="text/html|application/xhtml+xml"/>
+        <parameter name="pathSuffix" value=""/>
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/pom.xml b/nutch-plugins/parse-html/pom.xml
new file mode 100644
index 0000000..589155b
--- /dev/null
+++ b/nutch-plugins/parse-html/pom.xml
@@ -0,0 +1,49 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-html</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-html</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.ccil.cowan.tagsoup</groupId> <artifactId>tagsoup</artifactId> <version>1.2.1</version>
+        </dependency>
+        <dependency>
+            <groupId> net.sourceforge.nekohtml</groupId>
+            <artifactId>nekohtml</artifactId>
+            <version>1.9.22</version>
+        </dependency>
+
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java
new file mode 100644
index 0000000..6a1038b
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMBuilder.java
@@ -0,0 +1,766 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id$
+ */
+package org.apache.nutch.parse.html;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * This class takes SAX events (in addition to some extra events that SAX
+ * doesn't handle yet) and adds the result to a document or document fragment.
+ */
+public class DOMBuilder implements ContentHandler, LexicalHandler {
+
+  /** Root document */
+  public Document m_doc;
+
+  /** Current node */
+  protected Node m_currentNode = null;
+
+  /** First node of document fragment or null if not a DocumentFragment */
+  public DocumentFragment m_docFrag = null;
+
+  /** Vector of element nodes */
+  protected Stack<Element> m_elemStack = new Stack<Element>();
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param node
+   *          Current node
+   */
+  public DOMBuilder(Document doc, Node node) {
+    m_doc = doc;
+    m_currentNode = node;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document fragment.
+   * 
+   * @param doc
+   *          Root document
+   * @param docFrag
+   *          Document fragment
+   */
+  public DOMBuilder(Document doc, DocumentFragment docFrag) {
+    m_doc = doc;
+    m_docFrag = docFrag;
+  }
+
+  /**
+   * DOMBuilder instance constructor... it will add the DOM nodes to the
+   * document.
+   * 
+   * @param doc
+   *          Root document
+   */
+  public DOMBuilder(Document doc) {
+    m_doc = doc;
+  }
+
+  /**
+   * Get the root node of the DOM being created. This is either a Document or a
+   * DocumentFragment.
+   * 
+   * @return The root document or document fragment if not null
+   */
+  public Node getRootNode() {
+    return (null != m_docFrag) ? (Node) m_docFrag : (Node) m_doc;
+  }
+
+  /**
+   * Get the node currently being processed.
+   * 
+   * @return the current node being processed
+   */
+  public Node getCurrentNode() {
+    return m_currentNode;
+  }
+
+  /**
+   * Return null since there is no Writer for this class.
+   * 
+   * @return null
+   */
+  public java.io.Writer getWriter() {
+    return null;
+  }
+
+  /**
+   * Append a node to the current container.
+   * 
+   * @param newNode
+   *          New node to append
+   */
+  protected void append(Node newNode) throws org.xml.sax.SAXException {
+
+    Node currentNode = m_currentNode;
+
+    if (null != currentNode) {
+      currentNode.appendChild(newNode);
+
+      // System.out.println(newNode.getNodeName());
+    } else if (null != m_docFrag) {
+      m_docFrag.appendChild(newNode);
+    } else {
+      boolean ok = true;
+      short type = newNode.getNodeType();
+
+      if (type == Node.TEXT_NODE) {
+        String data = newNode.getNodeValue();
+
+        if ((null != data) && (data.trim().length() > 0)) {
+          throw new org.xml.sax.SAXException(
+              "Warning: can't output text before document element!  Ignoring...");
+        }
+
+        ok = false;
+      } else if (type == Node.ELEMENT_NODE) {
+        if (m_doc.getDocumentElement() != null) {
+          throw new org.xml.sax.SAXException(
+              "Can't have more than one root on a DOM!");
+        }
+      }
+
+      if (ok)
+        m_doc.appendChild(newNode);
+    }
+  }
+
+  /**
+   * Receive an object for locating the origin of SAX document events.
+   * 
+   * <p>
+   * SAX parsers are strongly encouraged (though not absolutely required) to
+   * supply a locator: if it does so, it must supply the locator to the
+   * application by invoking this method before invoking any of the other
+   * methods in the ContentHandler interface.
+   * </p>
+   * 
+   * <p>
+   * The locator allows the application to determine the end position of any
+   * document-related event, even if the parser is not reporting an error.
+   * Typically, the application will use this information for reporting its own
+   * errors (such as character content that does not match an application's
+   * business rules). The information returned by the locator is probably not
+   * sufficient for use with a search engine.
+   * </p>
+   * 
+   * <p>
+   * Note that the locator will return correct information only during the
+   * invocation of the events in this interface. The application should not
+   * attempt to use it at any other time.
+   * </p>
+   * 
+   * @param locator
+   *          An object that can return the location of any SAX document event.
+   * @see org.xml.sax.Locator
+   */
+  public void setDocumentLocator(Locator locator) {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of a document.
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, before any other methods
+   * in this interface or in DTDHandler (except for setDocumentLocator).
+   * </p>
+   */
+  public void startDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the end of a document.
+   * 
+   * <p>
+   * The SAX parser will invoke this method only once, and it will be the last
+   * method invoked during the parse. The parser shall not invoke this method
+   * until it has either abandoned parsing (because of an unrecoverable error)
+   * or reached the end of input.
+   * </p>
+   */
+  public void endDocument() throws org.xml.sax.SAXException {
+
+    // No action for the moment.
+  }
+
+  /**
+   * Receive notification of the beginning of an element.
+   * 
+   * <p>
+   * The Parser will invoke this method at the beginning of every element in the
+   * XML document; there will be a corresponding endElement() event for every
+   * startElement() event (even when the element is empty). All of the element's
+   * content will be reported, in order, before the corresponding endElement()
+   * event.
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached. Note that the attribute list provided will contain only
+   * attributes with explicit values (specified or defaulted): #IMPLIED
+   * attributes will be omitted.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          The namespace of the node
+   * @param localName
+   *          The local part of the qualified name
+   * @param name
+   *          The element name.
+   * @param atts
+   *          The attributes attached to the element, if any.
+   * @see #endElement
+   * @see org.xml.sax.Attributes
+   */
+  public void startElement(String ns, String localName, String name,
+      Attributes atts) throws org.xml.sax.SAXException {
+
+    Element elem;
+
+    // Note that the namespace-aware call must be used to correctly
+    // construct a Level 2 DOM, even for non-namespaced nodes.
+    if ((null == ns) || (ns.length() == 0))
+      elem = m_doc.createElementNS(null, name);
+    else
+      elem = m_doc.createElementNS(ns, name);
+
+    append(elem);
+
+    try {
+      int nAtts = atts.getLength();
+
+      if (0 != nAtts) {
+        for (int i = 0; i < nAtts; i++) {
+
+          // System.out.println("type " + atts.getType(i) + " name " +
+          // atts.getLocalName(i) );
+          // First handle a possible ID attribute
+          if (atts.getType(i).equalsIgnoreCase("ID"))
+            setIDAttribute(atts.getValue(i), elem);
+
+          String attrNS = atts.getURI(i);
+
+          if ("".equals(attrNS))
+            attrNS = null; // DOM represents no-namespace as null
+
+          // System.out.println("attrNS: "+attrNS+", localName: "+atts.getQName(i)
+          // +", qname: "+atts.getQName(i)+", value: "+atts.getValue(i));
+          // Crimson won't let us set an xmlns: attribute on the DOM.
+          String attrQName = atts.getQName(i);
+
+          // In SAX, xmlns: attributes have an empty namespace, while in DOM
+          // they should have the xmlns namespace
+          if (attrQName.startsWith("xmlns:"))
+            attrNS = "http://www.w3.org/2000/xmlns/";
+
+          // ALWAYS use the DOM Level 2 call!
+          elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
+        }
+      }
+
+      // append(elem);
+
+      m_elemStack.push(elem);
+
+      m_currentNode = elem;
+
+      // append(elem);
+    } catch (java.lang.Exception de) {
+      // de.printStackTrace();
+      throw new org.xml.sax.SAXException(de);
+    }
+
+  }
+
+  /**
+   * 
+   * 
+   * 
+   * Receive notification of the end of an element.
+   * 
+   * <p>
+   * The SAX parser will invoke this method at the end of every element in the
+   * XML document; there will be a corresponding startElement() event for every
+   * endElement() event (even when the element is empty).
+   * </p>
+   * 
+   * <p>
+   * If the element name has a namespace prefix, the prefix will still be
+   * attached to the name.
+   * </p>
+   * 
+   * 
+   * @param ns
+   *          the namespace of the element
+   * @param localName
+   *          The local part of the qualified name of the element
+   * @param name
+   *          The element name
+   */
+  public void endElement(String ns, String localName, String name)
+      throws org.xml.sax.SAXException {
+    m_elemStack.pop();
+    m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
+  }
+
+  /**
+   * Set an ID string to node association in the ID table.
+   * 
+   * @param id
+   *          The ID string.
+   * @param elem
+   *          The associated ID.
+   */
+  public void setIDAttribute(String id, Element elem) {
+
+    // Do nothing. This method is meant to be overridden.
+  }
+
+  /**
+   * Receive notification of character data.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void characters(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    if (m_inCData) {
+      cdata(ch, start, length);
+
+      return;
+    }
+
+    String s = new String(ch, start, length);
+    Node childNode;
+    childNode = m_currentNode != null ? m_currentNode.getLastChild() : null;
+    if (childNode != null && childNode.getNodeType() == Node.TEXT_NODE) {
+      ((Text) childNode).appendData(s);
+    } else {
+      Text text = m_doc.createTextNode(s);
+      append(text);
+    }
+  }
+
+  /**
+   * If available, when the disable-output-escaping attribute is used, output
+   * raw text without escaping. A PI will be inserted in front of the node with
+   * the name "lotusxsl-next-is-raw" and a value of "formatter-to-dom".
+   * 
+   * @param ch
+   *          Array containing the characters
+   * @param start
+   *          Index to start of characters in the array
+   * @param length
+   *          Number of characters in the array
+   */
+  public void charactersRaw(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createProcessingInstruction("xslt-next-is-raw",
+        "formatter-to-dom"));
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Report the beginning of an entity.
+   * 
+   * The start and end of the document entity are not reported. The start and
+   * end of the external DTD subset are reported using the pseudo-name "[dtd]".
+   * All other events must be properly nested within start/end entity events.
+   * 
+   * @param name
+   *          The name of the entity. If it is a parameter entity, the name will
+   *          begin with '%'.
+   * @see #endEntity
+   * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+   * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+   */
+  public void startEntity(String name) throws org.xml.sax.SAXException {
+
+    // Almost certainly the wrong behavior...
+    // entityReference(name);
+  }
+
+  /**
+   * Report the end of an entity.
+   * 
+   * @param name
+   *          The name of the entity that is ending.
+   * @see #startEntity
+   */
+  public void endEntity(String name) throws org.xml.sax.SAXException {
+  }
+
+  /**
+   * Receive notification of an entityReference.
+   * 
+   * @param name
+   *          name of the entity reference
+   */
+  public void entityReference(String name) throws org.xml.sax.SAXException {
+    append(m_doc.createEntityReference(name));
+  }
+
+  /**
+   * Receive notification of ignorable whitespace in element content.
+   * 
+   * <p>
+   * Validating Parsers must use this method to report each chunk of ignorable
+   * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
+   * non-validating parsers may also use this method if they are capable of
+   * parsing and using content models.
+   * </p>
+   * 
+   * <p>
+   * SAX parsers may return all contiguous whitespace in a single chunk, or they
+   * may split it into several chunks; however, all of the characters in any
+   * single event must come from the same external entity, so that the Locator
+   * provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #characters
+   */
+  public void ignorableWhitespace(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    if (isOutsideDocElem())
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    append(m_doc.createTextNode(s));
+  }
+
+  /**
+   * Tell if the current node is outside the document element.
+   * 
+   * @return true if the current node is outside the document element.
+   */
+  private boolean isOutsideDocElem() {
+    return (null == m_docFrag)
+        && m_elemStack.size() == 0
+        && (null == m_currentNode || m_currentNode.getNodeType() == Node.DOCUMENT_NODE);
+  }
+
+  /**
+   * Receive notification of a processing instruction.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each processing instruction
+   * found: note that processing instructions may occur before or after the main
+   * document element.
+   * </p>
+   * 
+   * <p>
+   * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
+   * or a text declaration (XML 1.0, section 4.3.1) using this method.
+   * </p>
+   * 
+   * @param target
+   *          The processing instruction target.
+   * @param data
+   *          The processing instruction data, or null if none was supplied.
+   */
+  public void processingInstruction(String target, String data)
+      throws org.xml.sax.SAXException {
+    append(m_doc.createProcessingInstruction(target, data));
+  }
+
+  /**
+   * Report an XML comment anywhere in the document.
+   * 
+   * This callback will be used for comments inside or outside the document
+   * element, including comments in the external DTD subset (if read).
+   * 
+   * @param ch
+   *          An array holding the characters in the comment.
+   * @param start
+   *          The starting position in the array.
+   * @param length
+   *          The number of characters to use from the array.
+   */
+  public void comment(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    // tagsoup sometimes submits invalid values here
+    if (ch == null || start < 0 || length >= (ch.length - start) || length < 0)
+      return;
+    append(m_doc.createComment(new String(ch, start, length)));
+  }
+
+  /** Flag indicating that we are processing a CData section */
+  protected boolean m_inCData = false;
+
+  /**
+   * Report the start of a CDATA section.
+   * 
+   * @see #endCDATA
+   */
+  public void startCDATA() throws org.xml.sax.SAXException {
+    m_inCData = true;
+    append(m_doc.createCDATASection(""));
+  }
+
+  /**
+   * Report the end of a CDATA section. Only clears the in-CDATA flag; the
+   * CDATA node itself was created by {@link #startCDATA}.
+   * 
+   * @see #startCDATA
+   */
+  public void endCDATA() throws org.xml.sax.SAXException {
+    m_inCData = false;
+  }
+
+  /**
+   * Receive notification of cdata.
+   * 
+   * <p>
+   * The Parser will call this method to report each chunk of character data.
+   * SAX parsers may return all contiguous character data in a single chunk, or
+   * they may split it into several chunks; however, all of the characters in
+   * any single event must come from the same external entity, so that the
+   * Locator provides useful information.
+   * </p>
+   * 
+   * <p>
+   * The application must not attempt to read from the array outside of the
+   * specified range.
+   * </p>
+   * 
+   * <p>
+   * Note that some parsers will report whitespace using the
+   * ignorableWhitespace() method rather than this one (validating parsers must
+   * do so).
+   * </p>
+   * 
+   * @param ch
+   *          The characters from the XML document.
+   * @param start
+   *          The start position in the array.
+   * @param length
+   *          The number of characters to read from the array.
+   * @see #ignorableWhitespace
+   * @see org.xml.sax.Locator
+   */
+  public void cdata(char ch[], int start, int length)
+      throws org.xml.sax.SAXException {
+    // Pure whitespace outside the document element cannot be appended to the
+    // Document node without raising a hierarchy error, so drop it.
+    if (isOutsideDocElem()
+        && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+      return; // avoid DOM006 Hierarchy request error
+
+    String s = new String(ch, start, length);
+
+    // XXX ab@apache.org: modified from the original, to accommodate TagSoup.
+    // Append the data to whatever node was attached last: normally the CDATA
+    // section created by startCDATA(), but TagSoup may leave a Comment there.
+    Node n = m_currentNode.getLastChild();
+    if (n instanceof CDATASection)
+      ((CDATASection) n).appendData(s);
+    else if (n instanceof Comment)
+      ((Comment) n).appendData(s);
+  }
+
+  /**
+   * Report the start of DTD declarations, if any.
+   * 
+   * Any declarations are assumed to be in the internal subset unless otherwise
+   * indicated.
+   * 
+   * @param name
+   *          The document type name.
+   * @param publicId
+   *          The declared public identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @param systemId
+   *          The declared system identifier for the external DTD subset, or
+   *          null if none was declared.
+   * @see #endDTD
+   * @see #startEntity
+   */
+  public void startDTD(String name, String publicId, String systemId)
+      throws org.xml.sax.SAXException {
+
+    // Do nothing for now: DTD information is not represented in the DOM built
+    // by this handler.
+  }
+
+  /**
+   * Report the end of DTD declarations. No-op, matching {@link #startDTD}.
+   * 
+   * @see #startDTD
+   */
+  public void endDTD() throws org.xml.sax.SAXException {
+
+    // Do nothing for now.
+  }
+
+  /**
+   * Begin the scope of a prefix-URI Namespace mapping.
+   * 
+   * <p>
+   * The information from this event is not necessary for normal Namespace
+   * processing: the SAX XML reader will automatically replace prefixes for
+   * element and attribute names when the http://xml.org/sax/features/namespaces
+   * feature is true (the default).
+   * </p>
+   * 
+   * <p>
+   * There are cases, however, when applications need to use prefixes in
+   * character data or in attribute values, where they cannot safely be expanded
+   * automatically; the start/endPrefixMapping event supplies the information to
+   * the application to expand prefixes in those contexts itself, if necessary.
+   * </p>
+   * 
+   * <p>
+   * Note that start/endPrefixMapping events are not guaranteed to be properly
+   * nested relative to each other: all startPrefixMapping events will occur
+   * before the corresponding startElement event, and all endPrefixMapping
+   * events will occur after the corresponding endElement event, but their order
+   * is not guaranteed.
+   * </p>
+   * 
+   * @param prefix
+   *          The Namespace prefix being declared.
+   * @param uri
+   *          The Namespace URI the prefix is mapped to.
+   * @see #endPrefixMapping
+   * @see #startElement
+   */
+  public void startPrefixMapping(String prefix, String uri)
+      throws org.xml.sax.SAXException {
+
+    // Intentionally a no-op: prefix mappings are not recorded in the DOM.
+    // The alternative below was considered and rejected (see comment).
+    /*
+     * // Not sure if this is needed or wanted // Also, it fails in the stree.
+     * if((null != m_currentNode) && (m_currentNode.getNodeType() ==
+     * Node.ELEMENT_NODE)) { String qname; if(((null != prefix) &&
+     * (prefix.length() == 0)) || (null == prefix)) qname = "xmlns"; else qname
+     * = "xmlns:"+prefix;
+     * 
+     * Element elem = (Element)m_currentNode; String val =
+     * elem.getAttribute(qname); // Obsolete, should be DOM2...? if(val == null)
+     * { elem.setAttributeNS("http://www.w3.org/XML/1998/namespace", qname,
+     * uri); } }
+     */
+  }
+
+  /**
+   * End the scope of a prefix-URI mapping.
+   * 
+   * <p>
+   * See startPrefixMapping for details. This event will always occur after the
+   * corresponding endElement event, but the order of endPrefixMapping events is
+   * not otherwise guaranteed.
+   * </p>
+   * 
+   * @param prefix
+   *          The prefix that was being mapped.
+   * @see #startPrefixMapping
+   * @see #endElement
+   */
+  public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
+    // No-op, matching startPrefixMapping.
+  }
+
+  /**
+   * Receive notification of a skipped entity.
+   * 
+   * <p>
+   * The Parser will invoke this method once for each entity skipped.
+   * Non-validating processors may skip entities if they have not seen the
+   * declarations (because, for example, the entity was declared in an external
+   * DTD subset). All processors may skip external entities, depending on the
+   * values of the http://xml.org/sax/features/external-general-entities and the
+   * http://xml.org/sax/features/external-parameter-entities properties.
+   * </p>
+   * 
+   * @param name
+   *          The name of the skipped entity. If it is a parameter entity, the
+   *          name will begin with '%'.
+   */
+  public void skippedEntity(String name) throws org.xml.sax.SAXException {
+    // No-op: skipped entities are not represented in the resulting DOM.
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java
new file mode 100644
index 0000000..3c2aba0
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -0,0 +1,400 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.util.Collection;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Stack;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.util.NodeWalker;
+import org.apache.nutch.util.URLUtil;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.*;
+
+/**
+ * A collection of methods for extracting content from DOM trees.
+ * 
+ * This class holds a few utility methods for pulling content out of DOM nodes,
+ * such as getOutlinks, getText, etc.
+ * 
+ */
+public class DOMContentUtils {
+
+  /**
+   * Per-tag link extraction parameters: the element name, the attribute that
+   * carries the link target, and the minimum number of children the element
+   * must have to be considered a "real" link.
+   */
+  public static class LinkParams {
+    public String elName;
+    public String attrName;
+    public int childLen;
+
+    public LinkParams(String elName, String attrName, int childLen) {
+      this.elName = elName;
+      this.attrName = attrName;
+      this.childLen = childLen;
+    }
+
+    public String toString() {
+      return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
+    }
+  }
+
+  // Maps lowercase tag name -> link extraction parameters; populated in
+  // setConf() and consulted by getOutlinks().
+  private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();
+  private Configuration conf;
+
+  public DOMContentUtils(Configuration conf) {
+    setConf(conf);
+  }
+
+  /**
+   * (Re)builds the tag -> LinkParams table from the configuration. Honors
+   * "parser.html.form.use_action" (whether FORM action attributes count as
+   * outlinks) and "parser.html.outlinks.ignore_tags" (tags whose links are
+   * dropped entirely).
+   */
+  public void setConf(Configuration conf) {
+    // forceTags is used to override configurable tag ignoring, later on
+    Collection<String> forceTags = new ArrayList<String>(1);
+
+    this.conf = conf;
+    linkParams.clear();
+    linkParams.put("a", new LinkParams("a", "href", 1));
+    linkParams.put("area", new LinkParams("area", "href", 0));
+    if (conf.getBoolean("parser.html.form.use_action", true)) {
+      linkParams.put("form", new LinkParams("form", "action", 1));
+      // only force-keep "form" when the property was set explicitly
+      if (conf.get("parser.html.form.use_action") != null)
+        forceTags.add("form");
+    }
+    linkParams.put("frame", new LinkParams("frame", "src", 0));
+    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
+    linkParams.put("script", new LinkParams("script", "src", 0));
+    linkParams.put("link", new LinkParams("link", "href", 0));
+    linkParams.put("img", new LinkParams("img", "src", 0));
+
+    // remove unwanted link tags from the linkParams map
+    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
+    for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
+      if (!forceTags.contains(ignoreTags[i]))
+        linkParams.remove(ignoreTags[i]);
+    }
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * append all the content text found beneath the DOM node to the
+   * <code>StringBuffer</code>.
+   * 
+   * <p>
+   * 
+   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
+   * and the <code>StringBuffer</code> will not contain any text encountered
+   * after a nested anchor is found.
+   * 
+   * <p>
+   * 
+   * @return true if nested anchors were found
+   */
+  public boolean getText(StringBuffer sb, Node node,
+      boolean abortOnNestedAnchors) {
+    if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * This is a convenience method, equivalent to
+   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
+   * 
+   */
+  public void getText(StringBuffer sb, Node node) {
+    getText(sb, node, false);
+  }
+
+  // returns true if abortOnNestedAnchors is true and we find nested
+  // anchors
+  private boolean getTextHelper(StringBuffer sb, Node node,
+      boolean abortOnNestedAnchors, int anchorDepth) {
+    boolean abort = false;
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      // script/style element content is not human-readable text
+      if ("script".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if ("style".equalsIgnoreCase(nodeName)) {
+        walker.skipChildren();
+      }
+      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
+        anchorDepth++;
+        if (anchorDepth > 1) {
+          abort = true;
+          break;
+        }
+      }
+      if (nodeType == Node.COMMENT_NODE) {
+        walker.skipChildren();
+      }
+      if (nodeType == Node.TEXT_NODE) {
+        // cleanup and trim the value: collapse runs of whitespace to a
+        // single space, then join fragments with single spaces
+        String text = currentNode.getNodeValue();
+        text = text.replaceAll("\\s+", " ");
+        text = text.trim();
+        if (text.length() > 0) {
+          if (sb.length() > 0)
+            sb.append(' ');
+          sb.append(text);
+        }
+      }
+    }
+
+    return abort;
+  }
+
+  /**
+   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
+   * append the content text found beneath the first <code>title</code> node to
+   * the <code>StringBuffer</code>.
+   * 
+   * @return true if a title node was found, false otherwise
+   */
+  public boolean getTitle(StringBuffer sb, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+        return false;
+      }
+
+      if (nodeType == Node.ELEMENT_NODE) {
+        if ("title".equalsIgnoreCase(nodeName)) {
+          getText(sb, currentNode);
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * If Node contains a BASE tag then its HREF is returned.
+   *
+   * @return the parsed BASE href URL, or null if there is no BASE tag before
+   *         BODY or its href is missing/malformed
+   */
+  public URL getBase(Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+
+      // is this node a BASE tag?
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
+          return null;
+        }
+
+        if ("base".equalsIgnoreCase(nodeName)) {
+          NamedNodeMap attrs = currentNode.getAttributes();
+          for (int i = 0; i < attrs.getLength(); i++) {
+            Node attr = attrs.item(i);
+            if ("href".equalsIgnoreCase(attr.getNodeName())) {
+              try {
+                return new URL(attr.getNodeValue());
+              } catch (MalformedURLException e) {
+                // malformed href: keep scanning remaining attributes/nodes
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // no.
+    return null;
+  }
+
+  /** Returns true if the node's value consists only of whitespace characters. */
+  private boolean hasOnlyWhiteSpace(Node node) {
+    String val = node.getNodeValue();
+    for (int i = 0; i < val.length(); i++) {
+      if (!Character.isWhitespace(val.charAt(i)))
+        return false;
+    }
+    return true;
+  }
+
+  // this only covers a few cases of empty links that are symptomatic
+  // of nekohtml's DOM-fixup process...
+  private boolean shouldThrowAwayLink(Node node, NodeList children,
+      int childLen, LinkParams params) {
+    if (childLen == 0) {
+      // this has no inner structure
+      if (params.childLen == 0)
+        return false;
+      else
+        return true;
+    } else if ((childLen == 1)
+        && (children.item(0).getNodeType() == Node.ELEMENT_NODE)
+        && (params.elName.equalsIgnoreCase(children.item(0).getNodeName()))) {
+      // single nested link
+      return true;
+
+    } else if (childLen == 2) {
+
+      Node c0 = children.item(0);
+      Node c1 = children.item(1);
+
+      if ((c0.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c0.getNodeName()))
+          && (c1.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c1)) {
+        // single link followed by whitespace node
+        return true;
+      }
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)) {
+        // whitespace node followed by single link
+        return true;
+      }
+
+    } else if (childLen == 3) {
+      Node c0 = children.item(0);
+      Node c1 = children.item(1);
+      Node c2 = children.item(2);
+
+      if ((c1.getNodeType() == Node.ELEMENT_NODE)
+          && (params.elName.equalsIgnoreCase(c1.getNodeName()))
+          && (c0.getNodeType() == Node.TEXT_NODE)
+          && (c2.getNodeType() == Node.TEXT_NODE) && hasOnlyWhiteSpace(c0)
+          && hasOnlyWhiteSpace(c2)) {
+        // single link surrounded by whitespace nodes
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * This method finds all anchors below the supplied DOM <code>node</code>, and
+   * creates appropriate {@link Outlink} records for each (relative to the
+   * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
+   * {@link ArrayList}.
+   * 
+   * <p>
+   * 
+   * Links without inner structure (tags, text, etc) are discarded, as are links
+   * which contain only single nested links and empty text nodes (this is a
+   * common DOM-fixup artifact, at least with nekohtml).
+   */
+  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
+
+    NodeWalker walker = new NodeWalker(node);
+    while (walker.hasNext()) {
+
+      Node currentNode = walker.nextNode();
+      String nodeName = currentNode.getNodeName();
+      short nodeType = currentNode.getNodeType();
+      NodeList children = currentNode.getChildNodes();
+      int childLen = (children != null) ? children.getLength() : 0;
+
+      if (nodeType == Node.ELEMENT_NODE) {
+
+        nodeName = nodeName.toLowerCase();
+        // NOTE(review): cast is redundant with the generic map; kept as-is.
+        LinkParams params = (LinkParams) linkParams.get(nodeName);
+        if (params != null) {
+          if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
+
+            StringBuffer linkText = new StringBuffer();
+            getText(linkText, currentNode, true);
+            if (linkText.toString().trim().length() == 0) {
+              // try harder - use img alt if present
+              NodeWalker subWalker = new NodeWalker(currentNode);
+              while (subWalker.hasNext()) {
+                Node subNode = subWalker.nextNode();
+                if (subNode.getNodeType() == Node.ELEMENT_NODE) {
+                  if (subNode.getNodeName().toLowerCase().equals("img")) {
+                    NamedNodeMap subAttrs = subNode.getAttributes();
+                    Node alt = subAttrs.getNamedItem("alt");
+                    if (alt != null) {
+                      String altTxt = alt.getTextContent();
+                      if (altTxt != null && altTxt.trim().length() > 0) {
+                        if (linkText.length() > 0)
+                          linkText.append(' ');
+                        linkText.append(altTxt);
+                      }
+                    }
+                  } else {
+                    // ignore other types of elements
+
+                  }
+                } else if (subNode.getNodeType() == Node.TEXT_NODE) {
+                  String txt = subNode.getTextContent();
+                  if (txt != null && txt.length() > 0) {
+                    if (linkText.length() > 0)
+                      linkText.append(' ');
+                    linkText.append(txt);
+                  }
+                }
+              }
+            }
+
+            // collect the target attribute and honor rel=nofollow and
+            // method=post (POST forms are not crawlable links)
+            NamedNodeMap attrs = currentNode.getAttributes();
+            String target = null;
+            boolean noFollow = false;
+            boolean post = false;
+            for (int i = 0; i < attrs.getLength(); i++) {
+              Node attr = attrs.item(i);
+              String attrName = attr.getNodeName();
+              if (params.attrName.equalsIgnoreCase(attrName)) {
+                target = attr.getNodeValue();
+              } else if ("rel".equalsIgnoreCase(attrName)
+                  && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+                noFollow = true;
+              } else if ("method".equalsIgnoreCase(attrName)
+                  && "post".equalsIgnoreCase(attr.getNodeValue())) {
+                post = true;
+              }
+            }
+            if (target != null && !noFollow && !post)
+              try {
+
+                URL url = URLUtil.resolveURL(base, target);
+                outlinks.add(new Outlink(url.toString(), linkText.toString()
+                    .trim()));
+              } catch (MalformedURLException e) {
+                // don't care
+              }
+          }
+          // this should not have any children, skip them
+          if (params.childLen == 0)
+            continue;
+        }
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
new file mode 100644
index 0000000..159aa76
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.net.URL;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.w3c.dom.*;
+
+/**
+ * Class for parsing META Directives from DOM trees. This class handles
+ * specifically Robots META directives (all, none, nofollow, noindex), finding
+ * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
+ * stored in a HTMLMetaTags instance.
+ */
+public class HTMLMetaProcessor {
+
+  /**
+   * Utility class with indicators for the robots directives "noindex" and
+   * "nofollow", and HTTP-EQUIV/no-cache
+   */
+
+  /**
+   * Sets the indicators in <code>metaTags</code> to appropriate values, based
+   * on any META tags found under the given <code>node</code>.
+   */
+  public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
+    metaTags.reset();
+    getMetaTagsHelper(metaTags, node, currURL);
+  }
+
+  // Recursive worker: inspects the node itself, then all of its children.
+  private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
+      URL currURL) {
+
+    if (node.getNodeType() == Node.ELEMENT_NODE) {
+
+      if ("body".equalsIgnoreCase(node.getNodeName())) {
+        // META tags should not be under body
+        return;
+      }
+
+      if ("meta".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node nameNode = null;
+        Node equivNode = null;
+        Node contentNode = null;
+        // Retrieves name, http-equiv and content attributes
+        for (int i = 0; i < attrs.getLength(); i++) {
+          Node attr = attrs.item(i);
+          String attrName = attr.getNodeName().toLowerCase();
+          if (attrName.equals("name")) {
+            nameNode = attr;
+          } else if (attrName.equals("http-equiv")) {
+            equivNode = attr;
+          } else if (attrName.equals("content")) {
+            contentNode = attr;
+          }
+        }
+
+        // <meta name=... content=...>: record the tag and interpret
+        // robots directives (none/noindex/nofollow/noarchive)
+        if (nameNode != null) {
+          if (contentNode != null) {
+            String name = nameNode.getNodeValue().toLowerCase();
+            metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
+            if ("robots".equals(name)) {
+
+              // NOTE(review): contentNode is already non-null here; this
+              // inner check is redundant.
+              if (contentNode != null) {
+                String directives = contentNode.getNodeValue().toLowerCase();
+                int index = directives.indexOf("none");
+
+                if (index >= 0) {
+                  metaTags.setNoIndex();
+                  metaTags.setNoFollow();
+                }
+
+                index = directives.indexOf("all");
+                if (index >= 0) {
+                  // do nothing...
+                }
+
+                index = directives.indexOf("noindex");
+                if (index >= 0) {
+                  metaTags.setNoIndex();
+                }
+
+                index = directives.indexOf("nofollow");
+                if (index >= 0) {
+                  metaTags.setNoFollow();
+                }
+
+                index = directives.indexOf("noarchive");
+                if (index >= 0) {
+                  metaTags.setNoCache();
+                }
+              }
+
+            } // end if (name == robots)
+          }
+        }
+
+        // <meta http-equiv=... content=...>: handle pragma/no-cache and
+        // refresh (time plus optional redirect URL)
+        if (equivNode != null) {
+          if (contentNode != null) {
+            String name = equivNode.getNodeValue().toLowerCase();
+            String content = contentNode.getNodeValue();
+            metaTags.getHttpEquivTags().setProperty(name, content);
+            if ("pragma".equals(name)) {
+              content = content.toLowerCase();
+              int index = content.indexOf("no-cache");
+              if (index >= 0)
+                metaTags.setNoCache();
+            } else if ("refresh".equals(name)) {
+              int idx = content.indexOf(';');
+              String time = null;
+              if (idx == -1) { // just the refresh time
+                time = content;
+              } else
+                time = content.substring(0, idx);
+              try {
+                metaTags.setRefreshTime(Integer.parseInt(time));
+                // skip this if we couldn't parse the time
+                metaTags.setRefresh(true);
+              } catch (Exception e) {
+                // unparsable refresh time: leave the refresh flag unset
+                ;
+              }
+              URL refreshUrl = null;
+              if (metaTags.getRefresh() && idx != -1) { // set the URL
+                idx = content.toLowerCase().indexOf("url=");
+                if (idx == -1) { // assume a mis-formatted entry with just the
+                                 // url
+                  idx = content.indexOf(';') + 1;
+                } else
+                  idx += 4;
+                if (idx != -1) {
+                  String url = content.substring(idx);
+                  try {
+                    refreshUrl = new URL(url);
+                  } catch (Exception e) {
+                    // XXX according to the spec, this has to be an absolute
+                    // XXX url. However, many websites use relative URLs and
+                    // XXX expect browsers to handle that.
+                    // XXX Unfortunately, in some cases this may create
+                    // XXX infinitely recursive paths (a crawler trap)...
+                    // if (!url.startsWith("/")) url = "/" + url;
+                    try {
+                      refreshUrl = new URL(currURL, url);
+                    } catch (Exception e1) {
+                      refreshUrl = null;
+                    }
+                  }
+                }
+              }
+              if (metaTags.getRefresh()) {
+                if (refreshUrl == null) {
+                  // apparently only refresh time was present. set the URL
+                  // to the same URL.
+                  refreshUrl = currURL;
+                }
+                metaTags.setRefreshHref(refreshUrl);
+              }
+            }
+          }
+        }
+
+      } else if ("base".equalsIgnoreCase(node.getNodeName())) {
+        // <base href=...>: record the document base URL, resolved against
+        // the current URL when relative
+        NamedNodeMap attrs = node.getAttributes();
+        Node hrefNode = attrs.getNamedItem("href");
+
+        if (hrefNode != null) {
+          String urlString = hrefNode.getNodeValue();
+
+          URL url = null;
+          try {
+            if (currURL == null)
+              url = new URL(urlString);
+            else
+              url = new URL(currURL, urlString);
+          } catch (Exception e) {
+            // malformed base href: ignore and leave url null
+            ;
+          }
+
+          if (url != null)
+            metaTags.setBaseHref(url);
+        }
+
+      }
+
+    }
+
+    // Recurse into all children (META/BASE may appear anywhere under HEAD).
+    NodeList children = node.getChildNodes();
+    if (children != null) {
+      int len = children.getLength();
+      for (int i = 0; i < len; i++) {
+        getMetaTagsHelper(metaTags, children.item(i), currURL);
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java
new file mode 100644
index 0000000..4d043ba
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -0,0 +1,352 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.util.ArrayList;
+import java.util.Map;
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
+import java.io.*;
+import java.util.regex.*;
+
+import org.cyberneko.html.parsers.*;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Content;
+import org.apache.hadoop.conf.*;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.util.*;
+
+/**
+ * Parser plugin for HTML documents. The raw bytes are decoded using a
+ * sniffed/auto-detected character encoding, parsed into a DOM fragment by
+ * either NekoHTML (the default) or TagSoup, and then text, title, outlinks
+ * and HTML meta directives are extracted. The configured
+ * {@link HtmlParseFilters} are run on the resulting parse.
+ */
+public class HtmlParser implements Parser {
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.html");
+
+  // I used 1000 bytes at first, but found that some documents have
+  // meta tag well past the first 1000 bytes.
+  // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
+  // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+  private static final int CHUNK_SIZE = 8192;
+
+  // NUTCH-1006 Meta equiv with single quotes not accepted
+  private static Pattern metaPattern = Pattern.compile(
+      "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
+      Pattern.CASE_INSENSITIVE);
+  private static Pattern charsetPattern = Pattern.compile(
+      "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
+  private static Pattern charsetPatternHTML5 = Pattern.compile(
+      "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
+      Pattern.CASE_INSENSITIVE);
+
+  // which DOM parser implementation to use: "neko" (default) or "tagsoup",
+  // read from the "parser.html.impl" configuration property in setConf()
+  private String parserImpl;
+
+  /**
+   * Given a <code>byte[]</code> representing an html file of an
+   * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
+   * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for
+   * Content-Type or no charset is specified, the content is checked for a
+   * Unicode Byte Order Mark (BOM). This will also cover non-byte oriented
+   * character encodings (UTF-16 only). If no character set can be determined,
+   * <code>null</code> is returned. <br />
+   * See also
+   * http://www.w3.org/International/questions/qa-html-encoding-declarations,
+   * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
+   * http://www.w3.org/TR/REC-xml/#sec-guessing
+   * 
+   * @param content
+   *          <code>byte[]</code> representation of an html file
+   * @return name of the detected character encoding, or <code>null</code>
+   *         if none could be determined
+   */
+
+  private static String sniffCharacterEncoding(byte[] content) {
+    int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE;
+
+    // We don't care about non-ASCII parts so that it's sufficient
+    // to just inflate each byte to a 16-bit value by padding.
+    // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
+    // {U+0041, U+0082, U+00B7}.
+    String str = new String(content, 0, length, StandardCharsets.US_ASCII);
+
+    Matcher metaMatcher = metaPattern.matcher(str);
+    String encoding = null;
+    if (metaMatcher.find()) {
+      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
+      if (charsetMatcher.find())
+        encoding = new String(charsetMatcher.group(1));
+    }
+    if (encoding == null) {
+      // check for HTML5 meta charset
+      metaMatcher = charsetPatternHTML5.matcher(str);
+      if (metaMatcher.find()) {
+        encoding = new String(metaMatcher.group(1));
+      }
+    }
+    if (encoding == null) {
+      // check for BOM
+      if (content.length >= 3 && content[0] == (byte) 0xEF
+          && content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
+        encoding = "UTF-8";
+      } else if (content.length >= 2) {
+        if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
+          encoding = "UTF-16LE";
+        } else if (content[0] == (byte) 0xFE && content[1] == (byte) 0xFF) {
+          encoding = "UTF-16BE";
+        }
+      }
+    }
+
+    return encoding;
+  }
+
+  // fallback encoding, from "parser.character.encoding.default"
+  private String defaultCharEncoding;
+
+  private Configuration conf;
+
+  // DOM helpers used for text/title/outlink extraction
+  private DOMContentUtils utils;
+
+  private HtmlParseFilters htmlParseFilters;
+
+  // value stored under Nutch.CACHING_FORBIDDEN_KEY when the page's meta
+  // directives forbid caching (metaTags.getNoCache())
+  private String cachingPolicy;
+
+  /**
+   * Parses the given {@link Content} into a {@link ParseResult} holding the
+   * extracted text, title, outlinks and metadata. A malformed base URL or a
+   * parser failure yields an empty parse result carrying the exception.
+   */
+  public ParseResult getParse(Content content) {
+    HTMLMetaTags metaTags = new HTMLMetaTags();
+
+    URL base;
+    try {
+      base = new URL(content.getBaseUrl());
+    } catch (MalformedURLException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    String text = "";
+    String title = "";
+    Outlink[] outlinks = new Outlink[0];
+    Metadata metadata = new Metadata();
+
+    // parse the content
+    DocumentFragment root;
+    try {
+      byte[] contentInOctets = content.getContent();
+      InputSource input = new InputSource(new ByteArrayInputStream(
+          contentInOctets));
+
+      // determine the character encoding: auto-detected clues plus the
+      // charset sniffed from meta tags / BOM in the first CHUNK_SIZE bytes
+      EncodingDetector detector = new EncodingDetector(conf);
+      detector.autoDetectClues(content, true);
+      detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
+      String encoding = detector.guessEncoding(content, defaultCharEncoding);
+
+      metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
+      metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
+
+      input.setEncoding(encoding);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Parsing...");
+      }
+      root = parse(input);
+    } catch (IOException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    } catch (DOMException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    } catch (SAXException e) {
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    } catch (Exception e) {
+      LOG.error("Error: ", e);
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    // get meta directives
+    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+
+    // populate Nutch metadata with HTML meta directives
+    metadata.addAll(metaTags.getGeneralTags());
+
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
+    }
+    // check meta directives
+    if (!metaTags.getNoIndex()) { // okay to index
+      StringBuffer sb = new StringBuffer();
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting text...");
+      }
+      utils.getText(sb, root); // extract text
+      text = sb.toString();
+      sb.setLength(0);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting title...");
+      }
+      utils.getTitle(sb, root); // extract title
+      title = sb.toString().trim();
+    }
+
+    if (!metaTags.getNoFollow()) { // okay to follow links
+      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
+      // a <base href=...> in the document overrides the content base URL
+      URL baseTag = utils.getBase(root);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Getting links...");
+      }
+      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+      outlinks = l.toArray(new Outlink[l.size()]);
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("found " + outlinks.length + " outlinks in "
+            + content.getUrl());
+      }
+    }
+
+    // a meta refresh directive is reported as a successful redirect
+    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
+    if (metaTags.getRefresh()) {
+      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
+      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
+          Integer.toString(metaTags.getRefreshTime()) });
+    }
+    ParseData parseData = new ParseData(status, title, outlinks,
+        content.getMetadata(), metadata);
+    ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
+        new ParseImpl(text, parseData));
+
+    // run filters on parse
+    ParseResult filteredParse = this.htmlParseFilters.filter(content,
+        parseResult, metaTags, root);
+    if (metaTags.getNoCache()) { // not okay to cache
+      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
+        entry.getValue().getData().getParseMeta()
+            .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+    }
+    return filteredParse;
+  }
+
+  /** Dispatches to TagSoup or NekoHTML based on "parser.html.impl". */
+  private DocumentFragment parse(InputSource input) throws Exception {
+    if (parserImpl.equalsIgnoreCase("tagsoup"))
+      return parseTagSoup(input);
+    else
+      return parseNeko(input);
+  }
+
+  /** Parses the input into a DocumentFragment using TagSoup. */
+  private DocumentFragment parseTagSoup(InputSource input) throws Exception {
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    DocumentFragment frag = doc.createDocumentFragment();
+    DOMBuilder builder = new DOMBuilder(doc, frag);
+    org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser();
+    reader.setContentHandler(builder);
+    reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+    reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false);
+    reader
+        .setProperty("http://xml.org/sax/properties/lexical-handler", builder);
+    reader.parse(input);
+    return frag;
+  }
+
+  /** Parses the input into a DocumentFragment using NekoHTML. */
+  private DocumentFragment parseNeko(InputSource input) throws Exception {
+    DOMFragmentParser parser = new DOMFragmentParser();
+    try {
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+              true);
+      parser.setFeature("http://cyberneko.org/html/features/augmentations",
+          true);
+      parser.setProperty(
+          "http://cyberneko.org/html/properties/default-encoding",
+          defaultCharEncoding);
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/ignore-specified-charset",
+              true);
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
+              false);
+      parser.setFeature(
+          "http://cyberneko.org/html/features/balance-tags/document-fragment",
+          true);
+      parser.setFeature("http://cyberneko.org/html/features/report-errors",
+          LOG.isTraceEnabled());
+    } catch (SAXException e) {
+    }
+    // convert Document to DocumentFragment
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    doc.setErrorChecking(false);
+    DocumentFragment res = doc.createDocumentFragment();
+    DocumentFragment frag = doc.createDocumentFragment();
+    parser.parse(input, frag);
+    res.appendChild(frag);
+
+    try {
+      // the fragment parser may deliver the document as a sequence of
+      // fragments; keep parsing until an empty fragment is returned
+      while (true) {
+        frag = doc.createDocumentFragment();
+        parser.parse(input, frag);
+        if (!frag.hasChildNodes())
+          break;
+        if (LOG.isInfoEnabled()) {
+          LOG.info(" - new frag, " + frag.getChildNodes().getLength()
+              + " nodes.");
+        }
+        res.appendChild(frag);
+      }
+    } catch (Exception e) {
+      LOG.error("Error: ", e);
+    }
+    ;
+    return res;
+  }
+
+  /**
+   * Command-line entry point: parses the file named by args[0] and prints
+   * the resulting parse data and text.
+   */
+  public static void main(String[] args) throws Exception {
+    // LOG.setLevel(Level.FINE);
+    String name = args[0];
+    String url = "file:" + name;
+    File file = new File(name);
+    byte[] bytes = new byte[(int) file.length()];
+    // NOTE(review): the stream is never closed — consider try-with-resources
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    in.readFully(bytes);
+    Configuration conf = NutchConfiguration.create();
+    HtmlParser parser = new HtmlParser();
+    parser.setConf(conf);
+    Parse parse = parser.getParse(
+        new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(
+        url);
+    System.out.println("data: " + parse.getData());
+
+    System.out.println("text: " + parse.getText());
+
+  }
+
+  /** Reads parser implementation, encoding and caching settings from conf. */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.htmlParseFilters = new HtmlParseFilters(getConf());
+    this.parserImpl = getConf().get("parser.html.impl", "neko");
+    this.defaultCharEncoding = getConf().get(
+        "parser.character.encoding.default", "windows-1252");
+    this.utils = new DOMContentUtils(conf);
+    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+        Nutch.CACHING_FORBIDDEN_CONTENT);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
new file mode 100644
index 0000000..eb382e8
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
@@ -0,0 +1,112 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id$
+ */
+package org.apache.nutch.parse.html;
+
+/**
+ * Class used to verify whether the specified <var>ch</var> conforms to the XML
+ * 1.0 definition of whitespace.
+ */
public class XMLCharacterRecognizer {

  /**
   * Returns whether the specified <var>ch</var> conforms to the XML 1.0
   * definition of whitespace. Refer to <A
   * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of
   * <CODE>S</CODE></A> for details.
   * 
   * @param ch
   *          Character to check as XML whitespace.
   * @return =true if <var>ch</var> is XML whitespace; otherwise =false.
   */
  public static boolean isWhiteSpace(char ch) {
    return (ch == 0x20) || (ch == 0x09) || (ch == 0xD) || (ch == 0xA);
  }

  /**
   * Tell if the string is whitespace.
   * 
   * @param ch
   *          Character array to check as XML whitespace.
   * @param start
   *          Start index of characters in the array
   * @param length
   *          Number of characters in the array
   * @return True if the characters in the array are XML whitespace; otherwise,
   *         false. (An empty range is considered whitespace.)
   */
  public static boolean isWhiteSpace(char ch[], int start, int length) {

    int end = start + length;

    for (int s = start; s < end; s++) {
      if (!isWhiteSpace(ch[s]))
        return false;
    }

    return true;
  }

  /**
   * Tell if the string is whitespace.
   * 
   * @param buf
   *          StringBuffer to check as XML whitespace.
   * @return True if characters in buffer are XML whitespace, false otherwise
   */
  public static boolean isWhiteSpace(StringBuffer buf) {
    return isWhiteSpace((CharSequence) buf);
  }

  /**
   * Tell if the string is whitespace.
   * 
   * @param s
   *          String to check as XML whitespace.
   * @return True if characters in buffer are XML whitespace, false otherwise.
   *         A <code>null</code> string is considered whitespace (preserves
   *         the original Xalan behavior).
   */
  public static boolean isWhiteSpace(String s) {
    return s == null || isWhiteSpace((CharSequence) s);
  }

  /**
   * Shared scan loop for the CharSequence-backed overloads; both
   * {@link String} and {@link StringBuffer} implement {@link CharSequence}.
   * 
   * @param cs
   *          non-null character sequence to check
   * @return True if every character is XML whitespace (vacuously true for an
   *         empty sequence), false otherwise
   */
  private static boolean isWhiteSpace(CharSequence cs) {

    int n = cs.length();

    for (int i = 0; i < n; i++) {
      if (!isWhiteSpace(cs.charAt(i)))
        return false;
    }

    return true;
  }

}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html
new file mode 100644
index 0000000..c650389
--- /dev/null
+++ b/nutch-plugins/parse-html/src/main/java/org/apache/nutch/parse/html/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>An HTML document parsing plugin.</p><p>This package relies on <a href="http://www.apache.org/~andyc/neko/doc/html/index.html">NekoHTML</a>.</p>
+</body>
+</html>