You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:31 UTC

[15/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
new file mode 100644
index 0000000..86692ae
--- /dev/null
+++ b/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.openqa.selenium.By;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
+import org.openqa.selenium.TimeoutException;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.firefox.FirefoxBinary;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.io.TemporaryFilesystem;
+import org.openqa.selenium.remote.DesiredCapabilities;
+import org.openqa.selenium.remote.RemoteWebDriver;
+import org.openqa.selenium.safari.SafariDriver;
+import org.openqa.selenium.phantomjs.PhantomJSDriver;
+import org.openqa.selenium.phantomjs.PhantomJSDriverService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.opera.core.systems.OperaDriver;
+
+public class HttpWebClient {
+
+  private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class);
+
+  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
+
+    @Override
+    protected WebDriver initialValue()
+    {
+      FirefoxProfile profile = new FirefoxProfile();
+      profile.setPreference("permissions.default.stylesheet", 2);
+      profile.setPreference("permissions.default.image", 2);
+      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
+      profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost");
+      WebDriver driver = new FirefoxDriver(profile);
+      return driver;          
+    };
+  };
+
+  public static WebDriver getDriverForPage(String url, Configuration conf) {
+      WebDriver driver = null;
+      DesiredCapabilities capabilities = null;
+      long pageLoadWait = conf.getLong("page.load.delay", 3);
+
+      try {
+        String driverType  = conf.get("selenium.driver", "firefox");
+        switch (driverType) {
+          case "firefox":
+          	String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost");
+          	long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45);
+          	boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false);
+          	int loadImage = conf.getInt("selenium.firefox.load.image", 1);
+          	int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1);
+    		    FirefoxProfile profile = new FirefoxProfile();
+    		    FirefoxBinary binary = new FirefoxBinary();
+    		    profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
+    		    profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
+    		    profile.setPreference("permissions.default.stylesheet", loadStylesheet);
+  	      	profile.setPreference("permissions.default.image", loadImage);
+    		    binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
+            driver = new FirefoxDriver(binary, profile);
+            break;
+          case "chrome":
+            driver = new ChromeDriver();
+            break;
+          case "safari":
+            driver = new SafariDriver();
+            break;
+          case "opera":
+            driver = new OperaDriver();
+            break;
+          case "phantomjs":
+            driver = new PhantomJSDriver();
+            break;
+          case "remote":
+            String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
+            int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
+            String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
+            String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
+            String seleniumGridDriver = conf.get("selenium.grid.driver","firefox");
+            String seleniumGridBinary = conf.get("selenium.grid.binary");
+
+            switch (seleniumGridDriver){
+              case "firefox":
+                capabilities = DesiredCapabilities.firefox();
+                capabilities.setBrowserName("firefox");
+                capabilities.setJavascriptEnabled(true);
+                capabilities.setCapability("firefox_binary",seleniumGridBinary);
+                System.setProperty("webdriver.reap_profile", "false");
+                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
+                break;
+              case "phantomjs":
+                capabilities = DesiredCapabilities.phantomjs();
+                capabilities.setBrowserName("phantomjs");
+                capabilities.setJavascriptEnabled(true);
+                capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,seleniumGridBinary);
+                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
+                break;
+              default:
+                LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
+                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox());
+                break;
+            }
+            break;
+          default:
+            LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
+            driver = new FirefoxDriver();
+            break;
+        }
+        LOG.debug("Selenium {} WebDriver selected.", driverType);
+  
+        driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
+        driver.get(url);
+      } catch (Exception e) {
+			  if(e instanceof TimeoutException) {
+          LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+          return driver;
+			  }
+			  cleanUpDriver(driver);
+		    throw new RuntimeException(e);
+	    } 
+
+      return driver;
+  }
+
+  public static String getHTMLContent(WebDriver driver, Configuration conf) {
+      if (conf.getBoolean("take.screenshot", false)) {
+        takeScreenshot(driver, conf);
+      }
+
+      return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+  }
+
+  public static void cleanUpDriver(WebDriver driver) {
+    if (driver != null) {
+      try {
+	      driver.close();
+        driver.quit();
+        TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  /**
+   * Function for obtaining the HTML BODY using the selected
+   * {@link org.openqa.selenium.WebDriver}.
+   * There are a number of configuration properties within
+   * <code>nutch-site.xml</code> which determine whether to
+   * take screenshots of the rendered pages and persist them
+   * as timestamped .png's into HDFS.
+   * @param url the URL to fetch and render
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the rendered inner HTML page
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = getDriverForPage(url, conf);
+    
+    try {
+      if (conf.getBoolean("take.screenshot", false)) {
+        takeScreenshot(driver, conf);
+      }
+
+      String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+      return innerHtml;
+
+      // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit
+    } catch (Exception e) {
+      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      throw new RuntimeException(e);
+    } finally {
+      cleanUpDriver(driver);
+    }
+  }
+
+  public static String getHtmlPage(String url) {
+    return getHtmlPage(url, null);
+  }
+
+  private static void takeScreenshot(WebDriver driver, Configuration conf) {
+    try {
+      String url = driver.getCurrentUrl();
+      File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+      LOG.debug("In-memory screenshot taken of: {}", url);
+      FileSystem fs = FileSystem.get(conf);
+      if (conf.get("screenshot.location") != null) {
+        Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName());
+        OutputStream os = null;
+        if (!fs.exists(screenshotPath)) {
+          LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
+          os = fs.create(screenshotPath);
+        }
+        InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
+        IOUtils.copyBytes(is, os, conf);
+        LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); 
+      } else {
+        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
+            + "'screenshot.location' is absent from nutch-site.xml.", url);
+      }
+    } catch (Exception e) {
+      cleanUpDriver(driver);
+      throw new RuntimeException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-xml/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/build.xml b/nutch-plugins/lib-xml/build.xml
new file mode 100644
index 0000000..0f87c07
--- /dev/null
+++ b/nutch-plugins/lib-xml/build.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-xml" default="jar">
+
+	<import file="../build-plugin.xml" />
+
+	<!--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! -->
+	<target name="compile" depends="init, resolve-default" />
+
+	<!--
+	<target name="jar" depends="compile">
+		<copy todir="${build.dir}" verbose="true">
+			<fileset dir="./lib" includes="**/*.jar" />
+		</copy>
+	</target>
+	-->
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-xml/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/ivy.xml b/nutch-plugins/lib-xml/ivy.xml
new file mode 100644
index 0000000..414f38a
--- /dev/null
+++ b/nutch-plugins/lib-xml/ivy.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default"/>
+    <dependency org="jaxen" name="jaxen" rev="1.1.1" conf="*->master"/>
+    <dependency org="xerces" name="xercesImpl" rev="2.11.0" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-xml/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/plugin.xml b/nutch-plugins/lib-xml/plugin.xml
new file mode 100644
index 0000000..79bd17f
--- /dev/null
+++ b/nutch-plugins/lib-xml/plugin.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! XML library - Gathers many XML related libraries:
+ !
+ ! * Jaxen
+ !     - Download : http://jaxen.org/releases.html
+ !     - License  : http://jaxen.org/license.html
+ !
+ !   * Xerces-J 2.6.1
+ !     - Download : http://xerces.apache.org/xerces2-j/download.cgi
+ !     - License  : http://www.apache.org/licenses/LICENSE-2.0
+ !
+ !   * SAXPath 1.0 FCS
+ !     - Note     : SAXPath has been incorporated into Jaxen.
+ !                  It has been merged into the Jaxen codebase
+ !                  and is no longer being maintained separately
+ !     - Download : http://sourceforge.net/project/showfiles.php?group_id=26014
+ !     - License  : OSI-Approved Open Source
+ !
+ !   * jdom 1.0 beta8-dev
+ !     - Download : http://www.jdom.org/downloads/index.html
+ !     - License  : http://www.jdom.org/docs/faq.html#a0030
+ !
+ !-->
+<plugin
+   id="lib-xml"
+   name="XML Libraries"
+   version="1.0"
+   provider-name="org.apache.nutch.xml">
+
+   <runtime>
+     <library name="jaxen-core.jar">
+       <export name="*"/>
+     </library>
+     <library name="jaxen-jdom.jar">
+       <export name="*"/>
+     </library>
+     <library name="xercesImpl.jar">
+       <export name="*"/>
+     </library>
+     <library name="saxpath.jar">
+       <export name="*"/>
+     </library>
+     <library name="jdom.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/lib-xml/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/pom.xml b/nutch-plugins/lib-xml/pom.xml
new file mode 100644
index 0000000..132d0f2
--- /dev/null
+++ b/nutch-plugins/lib-xml/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>lib-xml</artifactId>
+    <packaging>jar</packaging>
+
+    <name>lib-xml</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/build.xml b/nutch-plugins/microformats-reltag/build.xml
new file mode 100644
index 0000000..395afee
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="microformats-reltag" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/ivy.xml b/nutch-plugins/microformats-reltag/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/plugin.xml b/nutch-plugins/microformats-reltag/plugin.xml
new file mode 100644
index 0000000..b35e1f4
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/plugin.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="microformats-reltag"
+   name="Rel-Tag microformat Parser/Indexer/Querier"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+    <runtime>
+      <library name="microformats-reltag.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.microformats.reltag.RelTagParser"
+              name="Rel-Tag parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="RelTagParser"
+                      class="org.apache.nutch.microformats.reltag.RelTagParser"/>
+   </extension>
+
+   <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"
+              name="Rel-Tag indexing filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="RelTagIndexingFilter"
+                      class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/pom.xml b/nutch-plugins/microformats-reltag/pom.xml
new file mode 100644
index 0000000..8579cb5
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>microformats-reltag</artifactId>
+    <packaging>jar</packaging>
+
+    <name>microformats-reltag</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
new file mode 100644
index 0000000..e50a150
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.parse.Parse;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds one
+ * <code>tag</code> field to the document for every rel-tag value found in the
+ * parse metadata.
+ * 
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ *      http://www.microformats.org/wiki/rel-tag</a>
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class RelTagIndexingFilter implements IndexingFilter {
+
+  private Configuration conf;
+
+  /**
+   * Copies the values stored under {@link RelTagParser#REL_TAG} in the parse
+   * metadata into <code>tag</code> fields of the document.
+   * 
+   * @param doc the document to enrich
+   * @param parse the parse whose metadata is inspected
+   * @param url not used by this filter
+   * @param datum not used by this filter
+   * @param inlinks not used by this filter
+   * @return the same document instance that was passed in
+   */
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // Rel-Tags, if any, are possibly put there by RelTagParser
+    String[] relTags = parse.getData().getParseMeta()
+        .getValues(RelTagParser.REL_TAG);
+    if (relTags == null) {
+      return doc;
+    }
+
+    for (String relTag : relTags) {
+      doc.add("tag", relTag);
+    }
+    return doc;
+  }
+
+  /*
+   * ----------------------------- * <implementation:Configurable> *
+   * -----------------------------
+   */
+
+  /** Stores the configuration handed to this filter. */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /** Returns the configuration last passed to {@link #setConf}. */
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /*
+   * ------------------------------ * </implementation:Configurable> *
+   * ------------------------------
+   */
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
new file mode 100644
index 0000000..9176a1e
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// JDK imports
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeSet;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+// SLF4J logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.StringUtil;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Adds microformat rel-tags of document if found.
+ * 
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ *      http://www.microformats.org/wiki/rel-tag</a>
+ */
+public class RelTagParser implements HtmlParseFilter {
+
+  public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class);
+
+  public final static String REL_TAG = "Rel-Tag";
+
+  private Configuration conf = null;
+
+  /**
+   * Scan the HTML document looking at possible rel-tags
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // get parse obj
+    Parse parse = parseResult.get(content.getUrl());
+    // Trying to find the document's rel-tags
+    Parser parser = new Parser(doc);
+    Set<?> tags = parser.getRelTags();
+    Iterator<?> iter = tags.iterator();
+    Metadata metadata = parse.getData().getParseMeta();
+    while (iter.hasNext())
+      metadata.add(REL_TAG, (String) iter.next());
+
+    return parseResult;
+  }
+
+  private static class Parser {
+
+    Set<String> tags = null;
+
+    Parser(Node node) {
+      tags = new TreeSet<String>();
+      parse(node);
+    }
+
+    Set<String> getRelTags() {
+      return tags;
+    }
+
+    void parse(Node node) {
+
+      if (node.getNodeType() == Node.ELEMENT_NODE) {
+        // Look for <a> tag
+        if ("a".equalsIgnoreCase(node.getNodeName())) {
+          NamedNodeMap attrs = node.getAttributes();
+          Node hrefNode = attrs.getNamedItem("href");
+          // Checks that it contains a href attribute
+          if (hrefNode != null) {
+            Node relNode = attrs.getNamedItem("rel");
+            // Checks that it contains a rel attribute too
+            if (relNode != null) {
+              // Finaly checks that rel=tag
+              if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
+                String tag = parseTag(hrefNode.getNodeValue());
+                if (!StringUtil.isEmpty(tag)) {
+                  if (!tags.contains(tag)) {
+                    tags.add(tag);
+                    LOG.debug("Adding tag: " + tag + " to tag set.");
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+
+      // Recurse
+      NodeList children = node.getChildNodes();
+      for (int i = 0; children != null && i < children.getLength(); i++)
+        parse(children.item(i));
+    }
+
+    private final static String parseTag(String url) {
+      String tag = null;
+      try {
+        URL u = new URL(url);
+        String path = u.getPath();
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
+            "UTF-8");
+      } catch (Exception e) {
+        // Malformed tag...
+        tag = null;
+      }
+      return tag;
+    }
+
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
new file mode 100644
index 0000000..bef5409
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
@@ -0,0 +1,8 @@
+<html>
+<body>
+<p>
+A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a>
+Parser/Indexer/Querier plugin.
+</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/build.xml b/nutch-plugins/mimetype-filter/build.xml
new file mode 100644
index 0000000..977e643
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="mimetype-filter" default="jar-core">
+
+    <import file="../build-plugin.xml" />
+
+    <!-- for junit test -->
+    <mkdir dir="${build.test}/data"/>
+    <copy todir="${build.test}/data">
+        <fileset dir="sample" includes="**/*.txt"/>
+    </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/ivy.xml b/nutch-plugins/mimetype-filter/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/plugin.xml b/nutch-plugins/mimetype-filter/plugin.xml
new file mode 100644
index 0000000..d038447
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/plugin.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="mimetype-filter"
+   name="Filter indexed documents by the detected MIME"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="mimetype-filter.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+    <extension id="org.apache.nutch.indexer.filter"
+               name="Nutch MIME filter"
+               point="org.apache.nutch.indexer.IndexingFilter">
+        <implementation id="MimeTypeIndexingFilter"
+                        class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/>
+    </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/pom.xml b/nutch-plugins/mimetype-filter/pom.xml
new file mode 100644
index 0000000..29c0798
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>mimetype-filter</artifactId>
+    <packaging>jar</packaging>
+
+    <name>mimetype-filter</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
new file mode 100644
index 0000000..494d888
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.UnrecognizedOptionException;
+
+// Hadoop and Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+import org.apache.tika.Tika;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
+ * of documents based on the MIME Type detected by Tika
+ *
+ */
+public class MimeTypeIndexingFilter implements IndexingFilter {
+
+  public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MimeTypeIndexingFilter.class);
+
+  private MimeUtil MIME;
+  private Tika tika = new Tika();
+
+  private TrieStringMatcher trie;
+
+  private Configuration conf;
+
+  private boolean acceptMode = true;
+
+  // Inherited JavaDoc
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    String mimeType;
+    String contentType;
+
+    Writable tcontentType = datum.getMetaData()
+        .get(new Text(Response.CONTENT_TYPE));
+
+    if (tcontentType != null) {
+      contentType = tcontentType.toString();
+    } else {
+      contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
+    }
+
+    if (contentType == null) {
+      mimeType = tika.detect(url.toString());
+    } else {
+      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
+    }
+
+    contentType = mimeType;
+
+    if (LOG.isInfoEnabled()) {
+      LOG.info(String.format("[%s] %s", contentType, url));
+    }
+
+    if (trie != null) {
+      if (trie.shortestMatch(contentType) == null) {
+        // no match, but
+        if (acceptMode) {
+          return doc;
+        }
+        return null;
+      } else {
+        // matched, but we are blocking
+        if (acceptMode) {
+          return null;
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  /*
+   * -----------------------------
+   * <implementation:Configurable> *
+   * -----------------------------
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    MIME = new MimeUtil(conf);
+
+    // load the file of the values
+    String file = conf.get(MIMEFILTER_REGEX_FILE, "");
+
+    if (file != null) {
+      if (file.isEmpty()) {
+        LOG.warn(String
+            .format("Missing %s property, ALL mimetypes will be allowed",
+                MIMEFILTER_REGEX_FILE));
+      } else {
+        Reader reader = conf.getConfResourceAsReader(file);
+
+        try {
+          readConfiguration(reader);
+        } catch (IOException e) {
+          if (LOG.isErrorEnabled()) {
+            LOG.error(e.getMessage());
+          }
+
+          throw new RuntimeException(e.getMessage(), e);
+        }
+      }
+    }
+  }
+
+  private void readConfiguration(Reader reader) throws IOException {
+    BufferedReader in = new BufferedReader(reader);
+    String line;
+    List rules = new ArrayList();
+
+    while (null != (line = in.readLine())) {
+      if (line.length() == 0) {
+        continue;
+      }
+
+      char first = line.charAt(0);
+      switch (first) {
+      case ' ':
+      case '\n':
+      case '#': // skip blank & comment lines
+        break;
+      case '+':
+        acceptMode = true;
+        break;
+      case '-':
+        acceptMode = false;
+        break;
+      default:
+        rules.add(line);
+        break;
+      }
+    }
+
+    trie = new PrefixStringMatcher(rules);
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Main method for invoking this tool
+   *
+   * @throws IOException, IndexingException
+   */
+  public static void main(String[] args) throws IOException, IndexingException {
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
+        .withDescription(
+            "Rules file to be used in the tests relative to the conf directory")
+        .isRequired().create("rules");
+
+    Options options = new Options();
+    options.addOption(helpOpt).addOption(rulesOpt);
+
+    CommandLineParser parser = new GnuParser();
+    HelpFormatter formatter = new HelpFormatter();
+    String rulesFile;
+
+    try {
+      CommandLine line = parser.parse(options, args);
+
+      if (line.hasOption("help") || !line.hasOption("rules")) {
+        formatter
+            .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
+                options, true);
+        return;
+      }
+
+      rulesFile = line.getOptionValue("rules");
+    } catch (UnrecognizedOptionException e) {
+      formatter
+          .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
+              options, true);
+      return;
+    } catch (Exception e) {
+      LOG.error(StringUtils.stringifyException(e));
+      e.printStackTrace();
+      return;
+    }
+
+    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+    Configuration conf = NutchConfiguration.create();
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
+    filter.setConf(conf);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+
+    while ((line = in.readLine()) != null && !line.isEmpty()) {
+      Metadata metadata = new Metadata();
+      metadata.set(Response.CONTENT_TYPE, line);
+      ParseImpl parse = new ParseImpl("text",
+          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+
+      NutchDocument doc = filter.filter(new NutchDocument(), parse,
+          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+      if (doc != null) {
+        System.out.print("+ ");
+        System.out.println(line);
+      } else {
+        System.out.print("- ");
+        System.out.println(line);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/src/test/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/test/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java b/nutch-plugins/mimetype-filter/src/test/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
new file mode 100644
index 0000000..bca230f
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/test/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter}
+ *
+ */
+public class MimeTypeIndexingFilterTest {
+
+  private Configuration conf = NutchConfiguration.create();
+  private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+  private String[] MIME_TYPES = { "text/html", "image/png", "application/pdf" };
+  private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length];
+
+  /** Builds one parse per entry of MIME_TYPES, carrying it as content type. */
+  @Before
+  public void setUp() throws Exception {
+    for (int i = 0; i < MIME_TYPES.length; i++) {
+      Metadata metadata = new Metadata();
+      metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]);
+
+      parses[i] = new ParseImpl("text",
+          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+    }
+  }
+
+  /**
+   * Runs the filter under test on a fresh document built from the given parse.
+   *
+   * @return the document returned by the filter, <code>null</code> when the
+   *         filter dropped it
+   */
+  private NutchDocument applyFilter(ParseImpl parse) throws Exception {
+    return filter.filter(new NutchDocument(), parse,
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+  }
+
+  @Test
+  public void testMissingConfigFile() throws Exception {
+    String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
+    Assert.assertEquals(String
+        .format("Property %s must not be present in the configuration file",
+            MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
+
+    filter.setConf(conf);
+
+    // property not set, so in this case all documents must pass the filter
+    for (int i = 0; i < parses.length; i++) {
+      Assert.assertNotNull("All documents must be allowed by default",
+          applyFilter(parses[i]));
+    }
+  }
+
+  @Test
+  public void testAllowOnlyImages() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = applyFilter(parses[i]);
+
+      if (MIME_TYPES[i].contains("image")) {
+        Assert.assertNotNull("Allow only images", doc);
+      } else {
+        Assert.assertNull("Block everything else", doc);
+      }
+    }
+  }
+
+  @Test
+  public void testBlockHTML() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = applyFilter(parses[i]);
+
+      if (MIME_TYPES[i].contains("html")) {
+        Assert.assertNull("Block only HTML documents", doc);
+      } else {
+        Assert.assertNotNull("Allow everything else", doc);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt b/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt
new file mode 100644
index 0000000..0f5f136
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype per line, with no preceding
+# whitespace. The order in which mimetypes are specified doesn't matter. Blank
+# lines and comments (#) are allowed.
+#
+
+-
+
+image

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt b/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt
new file mode 100644
index 0000000..69600ec
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype per line, with no preceding
+# whitespace. The order in which mimetypes are specified doesn't matter. Blank
+# lines and comments (#) are allowed.
+#
+
++
+
+text/html
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/nutch-extensionpoints/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/build.xml b/nutch-plugins/nutch-extensionpoints/build.xml
new file mode 100644
index 0000000..45eb815
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/build.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="nutch-extensionpoints" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+  <!--
+   ! Override the compile and jar targets,
+   ! since there is nothing to compile here.
+   ! -->
+  <target name="compile" depends="init, resolve-default"/>
+
+  <!--target name="jar" depends="compile"/-->
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/nutch-extensionpoints/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/ivy.xml b/nutch-plugins/nutch-extensionpoints/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/nutch-extensionpoints/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/plugin.xml b/nutch-plugins/nutch-extensionpoints/plugin.xml
new file mode 100644
index 0000000..8cf7a23
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/plugin.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="nutch-extensionpoints"
+   name="the nutch core extension points"
+   version="2.0.0"
+   provider-name="nutch.org">
+
+   <!-- This file hosts all extension points offered by the Nutch core code.
+   Please note that plugins can define their own extension points as well, so that they are extendable too. -->
+
+<extension-point
+      id="org.apache.nutch.indexer.IndexingFilter"
+      name="Nutch Indexing Filter"/>
+
+<extension-point
+      id="org.apache.nutch.indexer.IndexWriter"
+      name="Nutch Index Writer"/>
+
+<extension-point
+      id="org.apache.nutch.parse.Parser"
+      name="Nutch Content Parser"/>
+ 
+<extension-point
+      id="org.apache.nutch.parse.HtmlParseFilter"
+      name="HTML Parse Filter"/>
+
+<extension-point
+      id="org.apache.nutch.protocol.Protocol"
+      name="Nutch Protocol"/>
+
+<extension-point
+      id="org.apache.nutch.net.URLFilter"
+      name="Nutch URL Filter"/>
+
+<extension-point
+        id="org.apache.nutch.net.URLExemptionFilter"
+        name="Nutch URL Ignore Exemption Filter"/>
+
+<extension-point
+      id="org.apache.nutch.net.URLNormalizer"
+      name="Nutch URL Normalizer"/>
+
+<extension-point
+      id="org.apache.nutch.scoring.ScoringFilter"
+      name="Nutch Scoring"/>
+
+<extension-point
+      id="org.apache.nutch.segment.SegmentMergeFilter"
+      name="Nutch Segment Merge Filter"/>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/nutch-extensionpoints/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/pom.xml b/nutch-plugins/nutch-extensionpoints/pom.xml
new file mode 100644
index 0000000..db76178
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>nutch-extensionpoints</artifactId>
+    <packaging>jar</packaging>
+
+    <name>nutch-extensionpoints</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/build.xml b/nutch-plugins/parse-ext/build.xml
new file mode 100644
index 0000000..25552fa
--- /dev/null
+++ b/nutch-plugins/parse-ext/build.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-ext" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+  </target>
+
+
+  <!--
+   ! Ship the sample external command next to the plugin and make it
+   ! executable (perm 755).
+   ! NOTE(review): these two tasks are declared outside of any target, so
+   ! they are not tied to a particular target; ${deploy.dir} is presumably
+   ! defined by the imported ../build-plugin.xml (confirm).
+   ! -->
+  <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/>
+  <chmod file="${deploy.dir}/command" perm="755"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/command
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/command b/nutch-plugins/parse-ext/command
new file mode 100644
index 0000000..f42c055
--- /dev/null
+++ b/nutch-plugins/parse-ext/command
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Sample bash script as external command invoked by parse-ext plugin
+#
+# 20040701, John Xing
+
+set -e
+
+if  [ $# -ne 1 ]; then
+  echo Usage:$0 mimeType >&2
+  exit 1
+fi
+
+case $1 in
+"application/vnd.nutch.example.cat")
+  cat
+  ;;
+"application/vnd.nutch.example.md5sum")
+  md5sum
+  ;;
+*)
+  echo "Can't parse mimeType $1" >&2
+  exit 1
+esac

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/ivy.xml b/nutch-plugins/parse-ext/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-ext/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/plugin.xml b/nutch-plugins/parse-ext/plugin.xml
new file mode 100644
index 0000000..6819b36
--- /dev/null
+++ b/nutch-plugins/parse-ext/plugin.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-ext"
+   name="External Parser Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parse-ext.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <!--
+    Each implementation below maps one content type to an external command.
+    ExtParser reads the following parameters:
+      contentType - content type handled by this entry (entry is skipped if empty)
+      command     - command to run (entry is skipped if empty); the content
+                    type is appended to it as an argument
+      timeout     - timeout in seconds; ExtParser falls back to 30 if absent
+      encoding    - optional; ExtParser falls back to the platform default charset
+    NOTE(review): pathSuffix is not read by ExtParser itself; presumably it is
+    consumed by the plugin framework - confirm.
+    NOTE(review): the command path is relative, so it presumably resolves
+    against the working directory of the running process - confirm that it
+    still matches the deployed location of the script.
+   -->
+   <extension id="org.apache.nutch.parse.ext"
+              name="ExtParse"
+              point="org.apache.nutch.parse.Parser">
+
+      <!-- sample entry: handled by the "cat" branch of the command script -->
+      <implementation id="ExtParser"
+                      class="org.apache.nutch.parse.ext.ExtParser">
+        <parameter name="contentType" value="application/vnd.nutch.example.cat"/>
+        <parameter name="pathSuffix"  value=""/>
+        <parameter name="command"     value="./build/plugins/parse-ext/command"/>
+        <parameter name="timeout"     value="10"/>
+        <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
+        <!-- <parameter name="encoding" value="UTF-8"/> -->
+      </implementation>
+
+      <!-- sample entry: handled by the "md5sum" branch of the command script -->
+      <implementation id="ExtParser"
+                      class="org.apache.nutch.parse.ext.ExtParser">
+        <parameter name="contentType" value="application/vnd.nutch.example.md5sum"/>
+        <parameter name="pathSuffix"  value=""/>
+        <parameter name="command"     value="./build/plugins/parse-ext/command"/>
+        <parameter name="timeout"     value="20"/>
+        <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
+        <!-- <parameter name="encoding" value="UTF-8"/> -->
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/pom.xml b/nutch-plugins/parse-ext/pom.xml
new file mode 100644
index 0000000..5a7b7be
--- /dev/null
+++ b/nutch-plugins/parse-ext/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-ext</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-ext</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
new file mode 100644
index 0000000..94d9b32
--- /dev/null
+++ b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
@@ -0,0 +1,183 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.ext;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+
+import org.apache.nutch.util.CommandRunner;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Hashtable;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.Charset;
+
+/**
+ * A {@link Parser} that delegates the real parsing job to an external command.
+ * 
+ * <p>
+ * {@link #setConf(Configuration)} collects, per content type, the command,
+ * timeout and output encoding declared by the extensions of the
+ * <code>parse-ext</code> plugin. {@link #getParse(Content)} then hands the raw
+ * content to that command through a {@link CommandRunner} and uses whatever
+ * ends up in the runner's standard output stream as the parse text.
+ * </p>
+ * 
+ * @author John Xing
+ */
+public class ExtParser implements Parser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.ext");
+
+  static final int BUFFER_SIZE = 4096;
+
+  static final int TIMEOUT_DEFAULT = 30; // in seconds
+
+  // handy map from String contentType to String[] {command, timeoutString,
+  // encoding}; filled by setConf(), read by getParse()
+  Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>();
+
+  private Configuration conf;
+
+  public ExtParser() {
+  }
+
+  /**
+   * Runs the external command configured for the content's type and wraps its
+   * output into a parse result.
+   * 
+   * @param content
+   *          the fetched content to parse
+   * @return a successful parse whose text is the command's output (with an
+   *         empty title and the outlinks extracted from that text), or an
+   *         empty parse result with a failed status if no command is
+   *         configured for the content type, the content length does not
+   *         match the <code>Content-Length</code> metadata, the command exits
+   *         with a non-zero value, or any exception occurs while running it
+   */
+  @Override
+  public ParseResult getParse(Content content) {
+
+    String contentType = content.getContentType();
+
+    // Hashtable.get(null) throws NullPointerException, so an absent content
+    // type is reported the same way as an unconfigured one.
+    String[] params = (contentType == null) ? null : TYPE_PARAMS_MAP
+        .get(contentType);
+    if (params == null) {
+      return new ParseStatus(ParseStatus.FAILED,
+          "No external command defined for contentType: " + contentType)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    String command = params[0];
+    String encoding = params[2];
+
+    String text;
+
+    try {
+
+      // Parsed inside the try block so that a malformed timeout results in a
+      // failed parse status instead of an escaping NumberFormatException.
+      int timeout = Integer.parseInt(params[1]);
+
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
+      }
+
+      byte[] raw = content.getContent();
+
+      // refuse content whose size disagrees with the declared length
+      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
+      if (contentLength != null
+          && raw.length != Integer.parseInt(contentLength)) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+                + " bytes. Parser can't handle incomplete " + contentType
+                + " file.").getEmptyParseResult(content.getUrl(), getConf());
+      }
+
+      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
+      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);
+
+      CommandRunner cr = new CommandRunner();
+
+      // the content type is appended to the command line as an argument
+      cr.setCommand(command + " " + contentType);
+      cr.setInputStream(new ByteArrayInputStream(raw));
+      cr.setStdOutputStream(os);
+      cr.setStdErrorStream(es);
+
+      cr.setTimeout(timeout);
+
+      cr.evaluate();
+
+      if (cr.getExitValue() != 0) {
+        return new ParseStatus(ParseStatus.FAILED, "External command "
+            + command + " failed with error: " + es.toString())
+            .getEmptyParseResult(content.getUrl(), getConf());
+      }
+
+      text = os.toString(encoding);
+
+    } catch (Exception e) { // run time exception
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    // the external command only yields text; no title is extracted
+    String title = "";
+
+    // collect outlink
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, content.getMetadata());
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+        parseData));
+  }
+
+  /**
+   * Stores the configuration and (re)reads the content type to command
+   * mappings from the extensions contributed by the <code>parse-ext</code>
+   * plugin. Extensions without a <code>contentType</code> or
+   * <code>command</code> attribute are ignored; a missing
+   * <code>encoding</code> falls back to the platform default charset and a
+   * missing or non-numeric <code>timeout</code> to {@link #TIMEOUT_DEFAULT}.
+   * 
+   * @param conf
+   *          the configuration to use
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
+
+    for (Extension extension : extensions) {
+
+      // only look for extensions defined by plugin parse-ext
+      if (!"parse-ext".equals(extension.getDescriptor().getPluginId()))
+        continue;
+
+      String contentType = extension.getAttribute("contentType");
+      if (contentType == null || contentType.equals(""))
+        continue;
+
+      String command = extension.getAttribute("command");
+      if (command == null || command.equals(""))
+        continue;
+
+      // null encoding means default
+      String encoding = extension.getAttribute("encoding");
+      if (encoding == null)
+        encoding = Charset.defaultCharset().name();
+
+      String timeoutString = checkedTimeout(extension.getAttribute("timeout"),
+          contentType);
+
+      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString,
+          encoding });
+    }
+  }
+
+  /**
+   * Returns the given timeout attribute if it is a valid integer, otherwise
+   * the default timeout, so that getParse() never sees an unparsable value.
+   */
+  private static String checkedTimeout(String timeoutString, String contentType) {
+    String fallback = "" + TIMEOUT_DEFAULT;
+    if (timeoutString == null)
+      return fallback;
+
+    String trimmed = timeoutString.trim();
+    if (trimmed.equals(""))
+      return fallback;
+
+    try {
+      Integer.parseInt(trimmed);
+      return trimmed;
+    } catch (NumberFormatException e) {
+      LOG.warn("Invalid timeout '" + timeoutString + "' for contentType "
+          + contentType + ", using default of " + TIMEOUT_DEFAULT + " secs");
+      return fallback;
+    }
+  }
+
+  /**
+   * @return the configuration last passed to {@link #setConf(Configuration)}
+   */
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
new file mode 100644
index 0000000..6394489
--- /dev/null
+++ b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse wrapper to run external command to do the parsing.
+ */
+package org.apache.nutch.parse.ext;
+