You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:56 UTC

[12/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
deleted file mode 100644
index 86692ae..0000000
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.selenium;
-
-import java.io.BufferedInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.net.URL;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IOUtils;
-import org.openqa.selenium.By;
-import org.openqa.selenium.OutputType;
-import org.openqa.selenium.TakesScreenshot;
-import org.openqa.selenium.TimeoutException;
-import org.openqa.selenium.WebDriver;
-import org.openqa.selenium.chrome.ChromeDriver;
-import org.openqa.selenium.firefox.FirefoxBinary;
-import org.openqa.selenium.firefox.FirefoxDriver;
-import org.openqa.selenium.firefox.FirefoxProfile;
-import org.openqa.selenium.io.TemporaryFilesystem;
-import org.openqa.selenium.remote.DesiredCapabilities;
-import org.openqa.selenium.remote.RemoteWebDriver;
-import org.openqa.selenium.safari.SafariDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriverService;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.opera.core.systems.OperaDriver;
-
-public class HttpWebClient {
-
-  private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class);
-
-  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
-
-    @Override
-    protected WebDriver initialValue()
-    {
-      FirefoxProfile profile = new FirefoxProfile();
-      profile.setPreference("permissions.default.stylesheet", 2);
-      profile.setPreference("permissions.default.image", 2);
-      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
-      profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost");
-      WebDriver driver = new FirefoxDriver(profile);
-      return driver;          
-    };
-  };
-
-  public static WebDriver getDriverForPage(String url, Configuration conf) {
-      WebDriver driver = null;
-      DesiredCapabilities capabilities = null;
-      long pageLoadWait = conf.getLong("page.load.delay", 3);
-
-      try {
-        String driverType  = conf.get("selenium.driver", "firefox");
-        switch (driverType) {
-          case "firefox":
-          	String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost");
-          	long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45);
-          	boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false);
-          	int loadImage = conf.getInt("selenium.firefox.load.image", 1);
-          	int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1);
-    		    FirefoxProfile profile = new FirefoxProfile();
-    		    FirefoxBinary binary = new FirefoxBinary();
-    		    profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
-    		    profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
-    		    profile.setPreference("permissions.default.stylesheet", loadStylesheet);
-  	      	profile.setPreference("permissions.default.image", loadImage);
-    		    binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
-            driver = new FirefoxDriver(binary, profile);
-            break;
-          case "chrome":
-            driver = new ChromeDriver();
-            break;
-          case "safari":
-            driver = new SafariDriver();
-            break;
-          case "opera":
-            driver = new OperaDriver();
-            break;
-          case "phantomjs":
-            driver = new PhantomJSDriver();
-            break;
-          case "remote":
-            String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
-            int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
-            String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
-            String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
-            String seleniumGridDriver = conf.get("selenium.grid.driver","firefox");
-            String seleniumGridBinary = conf.get("selenium.grid.binary");
-
-            switch (seleniumGridDriver){
-              case "firefox":
-                capabilities = DesiredCapabilities.firefox();
-                capabilities.setBrowserName("firefox");
-                capabilities.setJavascriptEnabled(true);
-                capabilities.setCapability("firefox_binary",seleniumGridBinary);
-                System.setProperty("webdriver.reap_profile", "false");
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
-                break;
-              case "phantomjs":
-                capabilities = DesiredCapabilities.phantomjs();
-                capabilities.setBrowserName("phantomjs");
-                capabilities.setJavascriptEnabled(true);
-                capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,seleniumGridBinary);
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
-                break;
-              default:
-                LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox());
-                break;
-            }
-            break;
-          default:
-            LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
-            driver = new FirefoxDriver();
-            break;
-        }
-        LOG.debug("Selenium {} WebDriver selected.", driverType);
-  
-        driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
-        driver.get(url);
-      } catch (Exception e) {
-			  if(e instanceof TimeoutException) {
-          LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
-          return driver;
-			  }
-			  cleanUpDriver(driver);
-		    throw new RuntimeException(e);
-	    } 
-
-      return driver;
-  }
-
-  public static String getHTMLContent(WebDriver driver, Configuration conf) {
-      if (conf.getBoolean("take.screenshot", false)) {
-        takeScreenshot(driver, conf);
-      }
-
-      return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
-  }
-
-  public static void cleanUpDriver(WebDriver driver) {
-    if (driver != null) {
-      try {
-	      driver.close();
-        driver.quit();
-        TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-      } catch (Exception e) {
-        throw new RuntimeException(e);
-      }
-    }
-  }
-
-  /**
-   * Function for obtaining the HTML BODY using the selected
-   * {@link org.openqa.selenium.WebDriver}.
-   * There are a number of configuration properties within
-   * <code>nutch-site.xml</code> which determine whether to
-   * take screenshots of the rendered pages and persist them
-   * as timestamped .png's into HDFS.
-   * @param url the URL to fetch and render
-   * @param conf the {@link org.apache.hadoop.conf.Configuration}
-   * @return the rendered inner HTML page
-   */
-  public static String getHtmlPage(String url, Configuration conf) {
-    WebDriver driver = getDriverForPage(url, conf);
-    
-    try {
-      if (conf.getBoolean("take.screenshot", false)) {
-        takeScreenshot(driver, conf);
-      }
-
-      String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");
-      return innerHtml;
-
-      // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit
-    } catch (Exception e) {
-      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-      throw new RuntimeException(e);
-    } finally {
-      cleanUpDriver(driver);
-    }
-  }
-
-  public static String getHtmlPage(String url) {
-    return getHtmlPage(url, null);
-  }
-
-  private static void takeScreenshot(WebDriver driver, Configuration conf) {
-    try {
-      String url = driver.getCurrentUrl();
-      File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
-      LOG.debug("In-memory screenshot taken of: {}", url);
-      FileSystem fs = FileSystem.get(conf);
-      if (conf.get("screenshot.location") != null) {
-        Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName());
-        OutputStream os = null;
-        if (!fs.exists(screenshotPath)) {
-          LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
-          os = fs.create(screenshotPath);
-        }
-        InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
-        IOUtils.copyBytes(is, os, conf);
-        LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); 
-      } else {
-        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
-            + "'screenshot.location' is absent from nutch-site.xml.", url);
-      }
-    } catch (Exception e) {
-      cleanUpDriver(driver);
-      throw new RuntimeException(e);
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-xml/build.xml b/src/plugin/lib-xml/build.xml
deleted file mode 100644
index 0f87c07..0000000
--- a/src/plugin/lib-xml/build.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="lib-xml" default="jar">
-
-	<import file="../build-plugin.xml" />
-
-	<!--
-   ! Override the compile target, since there is nothing to compile here.
-   ! (The matching jar target override below is currently commented out.)
-   ! -->
-	<target name="compile" depends="init, resolve-default" />
-
-	<!--
-	<target name="jar" depends="compile">
-		<copy todir="${build.dir}" verbose="true">
-			<fileset dir="./lib" includes="**/*.jar" />
-		</copy>
-	</target>
-	-->
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-xml/ivy.xml b/src/plugin/lib-xml/ivy.xml
deleted file mode 100644
index 414f38a..0000000
--- a/src/plugin/lib-xml/ivy.xml
+++ /dev/null
@@ -1,44 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-    <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default"/>
-    <dependency org="jaxen" name="jaxen" rev="1.1.1" conf="*->master"/>
-    <dependency org="xerces" name="xercesImpl" rev="2.11.0" conf="*->master"/>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-xml/plugin.xml b/src/plugin/lib-xml/plugin.xml
deleted file mode 100644
index 79bd17f..0000000
--- a/src/plugin/lib-xml/plugin.xml
+++ /dev/null
@@ -1,65 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!--
- ! XML library - Gathers many XML related libraries:
- !
- ! * Jaxen
- !     - Download : http://jaxen.org/releases.html
- !     - License  : http://jaxen.org/license.html
- !
- !   * Xerces-J 2.6.1
- !     - Download : http://xerces.apache.org/xerces2-j/download.cgi
- !     - License  : http://www.apache.org/licenses/LICENSE-2.0
- !
- !   * SAXPath 1.0 FCS
- !     - Note     : SAXPath has been incorporated into Jaxen.
- !                  It has been merged into the Jaxen codebase
- !                  and is no longer being maintained separately
- !     - Download : http://sourceforge.net/project/showfiles.php?group_id=26014
- !     - License  : OSI-Approved Open Source
- !
- !   * jdom 1.0 beta8-dev
- !     - Download : http://www.jdom.org/downloads/index.html
- !     - License  : http://www.jdom.org/docs/faq.html#a0030
- !
- !-->
-<plugin
-   id="lib-xml"
-   name="XML Libraries"
-   version="1.0"
-   provider-name="org.apache.nutch.xml">
-
-   <runtime>
-     <library name="jaxen-core.jar">
-       <export name="*"/>
-     </library>
-     <library name="jaxen-jdom.jar">
-       <export name="*"/>
-     </library>
-     <library name="xercesImpl.jar">
-       <export name="*"/>
-     </library>
-     <library name="saxpath.jar">
-       <export name="*"/>
-     </library>
-     <library name="jdom.jar">
-        <export name="*"/>
-     </library>
-   </runtime>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/build.xml b/src/plugin/microformats-reltag/build.xml
deleted file mode 100644
index 395afee..0000000
--- a/src/plugin/microformats-reltag/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="microformats-reltag" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/ivy.xml b/src/plugin/microformats-reltag/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/microformats-reltag/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/plugin.xml b/src/plugin/microformats-reltag/plugin.xml
deleted file mode 100644
index b35e1f4..0000000
--- a/src/plugin/microformats-reltag/plugin.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="microformats-reltag"
-   name="Rel-Tag microformat Parser/Indexer/Querier"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-    <runtime>
-      <library name="microformats-reltag.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.microformats.reltag.RelTagParser"
-              name="Rel-Tag parser"
-              point="org.apache.nutch.parse.HtmlParseFilter">
-      <implementation id="RelTagParser"
-                      class="org.apache.nutch.microformats.reltag.RelTagParser"/>
-   </extension>
-
-   <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"
-              name="Rel-Tag indexing filter"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="RelTagIndexingFilter"
-                      class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
-   </extension>
-
-</plugin>
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
deleted file mode 100644
index e50a150..0000000
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.microformats.reltag;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.parse.Parse;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that copies the rel-tag
- * values found in a page's parse metadata into <code>tag</code> field(s) of
- * the document being indexed.
- * 
- * @see <a href="http://www.microformats.org/wiki/rel-tag">
- *      http://www.microformats.org/wiki/rel-tag</a>
- * @author J&eacute;r&ocirc;me Charron
- */
-public class RelTagIndexingFilter implements IndexingFilter {
-
-  private Configuration conf;
-
-  /**
-   * Adds one <code>tag</code> field to <code>doc</code> for every value stored
-   * under {@link RelTagParser#REL_TAG} in the parse metadata.
-   *
-   * @return the same <code>doc</code> instance, with any tags added
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    String[] relTags = parse.getData().getParseMeta()
-        .getValues(RelTagParser.REL_TAG);
-
-    // Nothing recorded for this page (e.g. RelTagParser found no rel-tags)
-    if (relTags == null) {
-      return doc;
-    }
-
-    for (String relTag : relTags) {
-      doc.add("tag", relTag);
-    }
-    return doc;
-  }
-
-  /* ---- Configurable implementation ---- */
-
-  /** Stores the configuration handed to this filter. */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  /** Returns the configuration previously set, or null if none was set. */
-  public Configuration getConf() {
-    return conf;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
deleted file mode 100644
index 9176a1e..0000000
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.microformats.reltag;
-
-// JDK imports
-import java.net.URL;
-import java.net.URLDecoder;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.TreeSet;
-import org.w3c.dom.DocumentFragment;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-// Commons Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Nutch imports
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.StringUtil;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Adds microformat rel-tags of document if found.
- *
- * <p>Every <code>&lt;a&gt;</code> element that carries an <code>href</code>
- * and whose <code>rel</code> attribute contains the token <code>tag</code> is
- * treated as a rel-tag link; the last path segment of its <code>href</code>
- * becomes the tag value stored under {@link #REL_TAG} in the parse
- * metadata.</p>
- * 
- * @see <a href="http://www.microformats.org/wiki/rel-tag">
- *      http://www.microformats.org/wiki/rel-tag</a>
- */
-public class RelTagParser implements HtmlParseFilter {
-
-  public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class);
-
-  /** Parse-metadata key under which the collected tags are stored. */
-  public final static String REL_TAG = "Rel-Tag";
-
-  private Configuration conf = null;
-
-  /**
-   * Scan the HTML document looking at possible rel-tags and add each distinct
-   * tag to the parse metadata of the entry for <code>content.getUrl()</code>.
-   *
-   * @return the <code>parseResult</code> that was passed in
-   */
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-
-    // get parse obj
-    Parse parse = parseResult.get(content.getUrl());
-    Metadata metadata = parse.getData().getParseMeta();
-
-    // Trying to find the document's rel-tags
-    for (String tag : new Parser(doc).getRelTags()) {
-      metadata.add(REL_TAG, tag);
-    }
-
-    return parseResult;
-  }
-
-  /** Walks a DOM subtree once and collects the distinct rel-tag values in sorted order. */
-  private static class Parser {
-
-    private final Set<String> tags = new TreeSet<String>();
-
-    Parser(Node node) {
-      parse(node);
-    }
-
-    Set<String> getRelTags() {
-      return tags;
-    }
-
-    /** Records the tag of <code>node</code> if it is a rel-tag link, then recurses into its children. */
-    void parse(Node node) {
-
-      if (node.getNodeType() == Node.ELEMENT_NODE
-          && "a".equalsIgnoreCase(node.getNodeName())) {
-        NamedNodeMap attrs = node.getAttributes();
-        Node hrefNode = attrs.getNamedItem("href");
-        Node relNode = attrs.getNamedItem("rel");
-        // A rel-tag link needs both an href and a rel containing "tag"
-        if (hrefNode != null && relNode != null
-            && hasTagRelation(relNode.getNodeValue())) {
-          String tag = parseTag(hrefNode.getNodeValue());
-          // Set.add() reports whether the tag is new, so no separate contains() check
-          if (!StringUtil.isEmpty(tag) && tags.add(tag)) {
-            LOG.debug("Adding tag: {} to tag set.", tag);
-          }
-        }
-      }
-
-      // Recurse
-      NodeList children = node.getChildNodes();
-      for (int i = 0; children != null && i < children.getLength(); i++) {
-        parse(children.item(i));
-      }
-    }
-
-    /**
-     * Tells whether a <code>rel</code> attribute value lists the
-     * <code>tag</code> relation. The attribute is a whitespace-separated list
-     * of tokens, so <code>rel="tag nofollow"</code> must match as well.
-     */
-    private static boolean hasTagRelation(String rel) {
-      if (rel == null) {
-        return false;
-      }
-      for (String token : rel.trim().split("\\s+")) {
-        if ("tag".equalsIgnoreCase(token)) {
-          return true;
-        }
-      }
-      return false;
-    }
-
-    /**
-     * Extracts the tag from a link target: the URL-decoded text after the last
-     * '/' of the URL path.
-     *
-     * @return the tag, or <code>null</code> when <code>url</code> cannot be
-     *         parsed as an absolute URL or cannot be decoded
-     */
-    private final static String parseTag(String url) {
-      String tag = null;
-      try {
-        URL u = new URL(url);
-        String path = u.getPath();
-        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
-            "UTF-8");
-      } catch (Exception e) {
-        // Malformed tag... deliberately ignored: the link is simply not a usable tag
-        tag = null;
-      }
-      return tag;
-    }
-
-  }
-
-  /** Stores the configuration handed to this filter. */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  /** Returns the configuration previously set, or null if none was set. */
-  public Configuration getConf() {
-    return this.conf;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
deleted file mode 100644
index bef5409..0000000
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
+++ /dev/null
@@ -1,8 +0,0 @@
-<html>
-<body>
-<p>
-A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a>
-Parser/Indexer/Querier plugin.
-</p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/build.xml b/src/plugin/mimetype-filter/build.xml
deleted file mode 100644
index 977e643..0000000
--- a/src/plugin/mimetype-filter/build.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="mimetype-filter" default="jar-core">
-
-    <import file="../build-plugin.xml" />
-
-    <!-- for junit test -->
-    <mkdir dir="${build.test}/data"/>
-    <copy todir="${build.test}/data">
-        <fileset dir="sample" includes="**/*.txt"/>
-    </copy>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/ivy.xml b/src/plugin/mimetype-filter/ivy.xml
deleted file mode 100644
index 0a363f7..0000000
--- a/src/plugin/mimetype-filter/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/plugin.xml b/src/plugin/mimetype-filter/plugin.xml
deleted file mode 100644
index d038447..0000000
--- a/src/plugin/mimetype-filter/plugin.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="mimetype-filter"
-   name="Filter indexed documents by the detected MIME"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="mimetype-filter.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-    <extension id="org.apache.nutch.indexer.filter"
-               name="Nutch MIME filter"
-               point="org.apache.nutch.indexer.IndexingFilter">
-        <implementation id="MimeTypeIndexingFilter"
-                        class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/>
-    </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/sample/allow-images.txt
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/sample/allow-images.txt b/src/plugin/mimetype-filter/sample/allow-images.txt
deleted file mode 100644
index 0f5f136..0000000
--- a/src/plugin/mimetype-filter/sample/allow-images.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This filter can be configured to work in one of two modes (similar to
-# suffix-url-filter)
-
-# default to reject ('-'): in this mode, only documents with a mimetype that
-# match the ones specified in the config file will be accepted, all other
-# mimetypes will be rejected.
-
-# default to accept ('+'): in this mode, only documents with a mimetype
-# that match the ones specified in the config file will be rejected,
-# all other mimetypes will be accepted.
-
-# The format of this config file is one mimetype per line, with no preceding
-# whitespace. Order, in which suffixes are specified, doesn't matter. Blank
-# lines and comments (#) are allowed.
-#
-
--
-
-image

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/sample/block-html.txt
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/sample/block-html.txt b/src/plugin/mimetype-filter/sample/block-html.txt
deleted file mode 100644
index 69600ec..0000000
--- a/src/plugin/mimetype-filter/sample/block-html.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This filter can be configured to work in one of two modes (similar to
-# suffix-url-filter)
-
-# default to reject ('-'): in this mode, only documents with a mimetype that
-# match the ones specified in the config file will be accepted, all other
-# mimetypes will be rejected.
-
-# default to accept ('+'): in this mode, only documents with a mimetype
-# that match the ones specified in the config file will be rejected,
-# all other mimetypes will be accepted.
-
-# The format of this config file is one mimetype per line, with no preceding
-# whitespace. Order, in which suffixes are specified, doesn't matter. Blank
-# lines and comments (#) are allowed.
-#
-
-+
-
-text/html
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
deleted file mode 100644
index 494d888..0000000
--- a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.filter;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.UnrecognizedOptionException;
-
-// Nutch imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-
-import org.apache.nutch.net.protocols.Response;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-
-import org.apache.nutch.metadata.Metadata;
-
-import org.apache.nutch.util.MimeUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.PrefixStringMatcher;
-import org.apache.nutch.util.TrieStringMatcher;
-import org.apache.tika.Tika;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
- * of documents based on the MIME Type detected by Tika
- *
- */
-public class MimeTypeIndexingFilter implements IndexingFilter {
-
-  public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";
-
-  private static final Logger LOG = LoggerFactory
-      .getLogger(MimeTypeIndexingFilter.class);
-
-  private MimeUtil MIME;
-  private Tika tika = new Tika();
-
-  private TrieStringMatcher trie;
-
-  private Configuration conf;
-
-  private boolean acceptMode = true;
-
-  // Inherited JavaDoc
-  @Override
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    String mimeType;
-    String contentType;
-
-    Writable tcontentType = datum.getMetaData()
-        .get(new Text(Response.CONTENT_TYPE));
-
-    if (tcontentType != null) {
-      contentType = tcontentType.toString();
-    } else {
-      contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
-    }
-
-    if (contentType == null) {
-      mimeType = tika.detect(url.toString());
-    } else {
-      mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
-    }
-
-    contentType = mimeType;
-
-    if (LOG.isInfoEnabled()) {
-      LOG.info(String.format("[%s] %s", contentType, url));
-    }
-
-    if (trie != null) {
-      if (trie.shortestMatch(contentType) == null) {
-        // no match, but
-        if (acceptMode) {
-          return doc;
-        }
-        return null;
-      } else {
-        // matched, but we are blocking
-        if (acceptMode) {
-          return null;
-        }
-      }
-    }
-
-    return doc;
-  }
-
-  /*
-   * -----------------------------
-   * <implementation:Configurable> *
-   * -----------------------------
-   */
-  @Override
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    MIME = new MimeUtil(conf);
-
-    // load the file of the values
-    String file = conf.get(MIMEFILTER_REGEX_FILE, "");
-
-    if (file != null) {
-      if (file.isEmpty()) {
-        LOG.warn(String
-            .format("Missing %s property, ALL mimetypes will be allowed",
-                MIMEFILTER_REGEX_FILE));
-      } else {
-        Reader reader = conf.getConfResourceAsReader(file);
-
-        try {
-          readConfiguration(reader);
-        } catch (IOException e) {
-          if (LOG.isErrorEnabled()) {
-            LOG.error(e.getMessage());
-          }
-
-          throw new RuntimeException(e.getMessage(), e);
-        }
-      }
-    }
-  }
-
-  private void readConfiguration(Reader reader) throws IOException {
-    BufferedReader in = new BufferedReader(reader);
-    String line;
-    List rules = new ArrayList();
-
-    while (null != (line = in.readLine())) {
-      if (line.length() == 0) {
-        continue;
-      }
-
-      char first = line.charAt(0);
-      switch (first) {
-      case ' ':
-      case '\n':
-      case '#': // skip blank & comment lines
-        break;
-      case '+':
-        acceptMode = true;
-        break;
-      case '-':
-        acceptMode = false;
-        break;
-      default:
-        rules.add(line);
-        break;
-      }
-    }
-
-    trie = new PrefixStringMatcher(rules);
-  }
-
-  @Override
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Main method for invoking this tool
-   *
-   * @throws IOException, IndexingException
-   */
-  public static void main(String[] args) throws IOException, IndexingException {
-    Option helpOpt = new Option("h", "help", false, "show this help message");
-    Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
-        .withDescription(
-            "Rules file to be used in the tests relative to the conf directory")
-        .isRequired().create("rules");
-
-    Options options = new Options();
-    options.addOption(helpOpt).addOption(rulesOpt);
-
-    CommandLineParser parser = new GnuParser();
-    HelpFormatter formatter = new HelpFormatter();
-    String rulesFile;
-
-    try {
-      CommandLine line = parser.parse(options, args);
-
-      if (line.hasOption("help") || !line.hasOption("rules")) {
-        formatter
-            .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
-                options, true);
-        return;
-      }
-
-      rulesFile = line.getOptionValue("rules");
-    } catch (UnrecognizedOptionException e) {
-      formatter
-          .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
-              options, true);
-      return;
-    } catch (Exception e) {
-      LOG.error(StringUtils.stringifyException(e));
-      e.printStackTrace();
-      return;
-    }
-
-    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
-    Configuration conf = NutchConfiguration.create();
-    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
-    filter.setConf(conf);
-
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
-    String line;
-
-    while ((line = in.readLine()) != null && !line.isEmpty()) {
-      Metadata metadata = new Metadata();
-      metadata.set(Response.CONTENT_TYPE, line);
-      ParseImpl parse = new ParseImpl("text",
-          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
-
-      NutchDocument doc = filter.filter(new NutchDocument(), parse,
-          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-      if (doc != null) {
-        System.out.print("+ ");
-        System.out.println(line);
-      } else {
-        System.out.print("- ");
-        System.out.println(line);
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java b/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
deleted file mode 100644
index bca230f..0000000
--- a/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.filter;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * JUnit based tests of class
- * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter}
- *
- */
-public class MimeTypeIndexingFilterTest {
-
-  private Configuration conf = NutchConfiguration.create();
-  private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
-  private String[] MIME_TYPES = { "text/html", "image/png", "application/pdf" };
-  private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length];
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  @Before
-  public void setUp() throws Exception {
-    for (int i = 0; i < MIME_TYPES.length; i++) {
-      Metadata metadata = new Metadata();
-      metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]);
-
-      ParseImpl parse = new ParseImpl("text",
-          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
-
-      parses[i] = parse;
-    }
-  }
-
-  @Test
-  public void testMissingConfigFile() throws Exception {
-    String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
-    Assert.assertEquals(String
-        .format("Property %s must not be present in the the configuration file",
-            MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
-
-    filter.setConf(conf);
-
-    // property not set so in this cases all documents must pass the filter
-    for (int i = 0; i < parses.length; i++) {
-      NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
-          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-      Assert.assertNotNull("All documents must be allowed by default", doc);
-    }
-  }
-
-  @Test
-  public void testAllowOnlyImages() throws Exception {
-    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
-    filter.setConf(conf);
-
-    for (int i = 0; i < parses.length; i++) {
-      NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
-          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-      if (MIME_TYPES[i].contains("image")) {
-        Assert.assertNotNull("Allow only images", doc);
-      } else {
-        Assert.assertNull("Block everything else", doc);
-      }
-    }
-  }
-
-  @Test
-  public void testBlockHTML() throws Exception {
-    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
-    filter.setConf(conf);
-
-    for (int i = 0; i < parses.length; i++) {
-      NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
-          new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-      if (MIME_TYPES[i].contains("html")) {
-        Assert.assertNull("Block only HTML documents", doc);
-      } else {
-        Assert.assertNotNull("Allow everything else", doc);
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/nutch-extensionpoints/build.xml b/src/plugin/nutch-extensionpoints/build.xml
deleted file mode 100644
index 45eb815..0000000
--- a/src/plugin/nutch-extensionpoints/build.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="nutch-extensionpoints" default="jar">
-
-  <import file="../build-plugin.xml"/>
-
-  <!--
-   ! Override the compile and jar targets,
-   ! since there is nothing to compile here.
-   ! -->
-  <target name="compile" depends="init, resolve-default"/>
-
-  <!--target name="jar" depends="compile"/-->
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/nutch-extensionpoints/ivy.xml b/src/plugin/nutch-extensionpoints/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/nutch-extensionpoints/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/nutch-extensionpoints/plugin.xml b/src/plugin/nutch-extensionpoints/plugin.xml
deleted file mode 100644
index 8cf7a23..0000000
--- a/src/plugin/nutch-extensionpoints/plugin.xml
+++ /dev/null
@@ -1,67 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="nutch-extensionpoints"
-   name="the nutch core extension points"
-   version="2.0.0"
-   provider-name="nutch.org">
-
-   <!-- this file hosts all extension points nutch core code offers. 
-   Please not that plugins can define extension points as well to be extendable.-->
-
-<extension-point
-      id="org.apache.nutch.indexer.IndexingFilter"
-      name="Nutch Indexing Filter"/>
-
-<extension-point
-      id="org.apache.nutch.indexer.IndexWriter"
-      name="Nutch Index Writer"/>
-
-<extension-point
-      id="org.apache.nutch.parse.Parser"
-      name="Nutch Content Parser"/>
- 
-<extension-point
-      id="org.apache.nutch.parse.HtmlParseFilter"
-      name="HTML Parse Filter"/>
-
-<extension-point
-      id="org.apache.nutch.protocol.Protocol"
-      name="Nutch Protocol"/>
-
-<extension-point
-      id="org.apache.nutch.net.URLFilter"
-      name="Nutch URL Filter"/>
-
-<extension-point
-        id="org.apache.nutch.net.URLExemptionFilter"
-        name="Nutch URL Ignore Exemption Filter"/>
-
-<extension-point
-      id="org.apache.nutch.net.URLNormalizer"
-      name="Nutch URL Normalizer"/>
-
-<extension-point
-      id="org.apache.nutch.scoring.ScoringFilter"
-      name="Nutch Scoring"/>
-
-<extension-point
-      id="org.apache.nutch.segment.SegmentMergeFilter"
-      name="Nutch Segment Merge Filter"/>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/build.xml b/src/plugin/parse-ext/build.xml
deleted file mode 100644
index 25552fa..0000000
--- a/src/plugin/parse-ext/build.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-ext" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
-  </target>
-
-
-  <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/>
-  <chmod file="${deploy.dir}/command" perm="755"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/command
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/command b/src/plugin/parse-ext/command
deleted file mode 100644
index f42c055..0000000
--- a/src/plugin/parse-ext/command
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-#
-# Sample bash script as external command invoked by parse-ext plugin
-#
-# 20040701, John Xing
-
-set -e
-
-if  [ $# -ne 1 ]; then
-  echo Usage:$0 mimeType >&2
-  exit 1
-fi
-
-case $1 in
-"application/vnd.nutch.example.cat")
-  cat
-  ;;
-"application/vnd.nutch.example.md5sum")
-  md5sum
-  ;;
-*)
-  echo "Can't parse mimeType $1" >&2
-  exit 1
-esac

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/ivy.xml b/src/plugin/parse-ext/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-ext/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/plugin.xml b/src/plugin/parse-ext/plugin.xml
deleted file mode 100644
index 6819b36..0000000
--- a/src/plugin/parse-ext/plugin.xml
+++ /dev/null
@@ -1,60 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="parse-ext"
-   name="External Parser Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="parse-ext.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.parse.ext"
-              name="ExtParse"
-              point="org.apache.nutch.parse.Parser">
-
-      <implementation id="ExtParser"
-                      class="org.apache.nutch.parse.ext.ExtParser">
-        <parameter name="contentType" value="application/vnd.nutch.example.cat"/>
-        <parameter name="pathSuffix"  value=""/>
-        <parameter name="command"     value="./build/plugins/parse-ext/command"/>
-        <parameter name="timeout"     value="10"/>
-        <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
-        <!-- <parameter name="encoding" value="UTF-8"/> -->
-      </implementation>
-
-      <implementation id="ExtParser"
-                      class="org.apache.nutch.parse.ext.ExtParser">
-        <parameter name="contentType" value="application/vnd.nutch.example.md5sum"/>
-        <parameter name="pathSuffix"  value=""/>
-        <parameter name="command"     value="./build/plugins/parse-ext/command"/>
-        <parameter name="timeout"     value="20"/>
-        <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
-        <!-- <parameter name="encoding" value="UTF-8"/> -->
-      </implementation>
-
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
deleted file mode 100644
index 94d9b32..0000000
--- a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.ext;
-
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
-
-import org.apache.nutch.util.CommandRunner;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.hadoop.conf.Configuration;
-
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.PluginRepository;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Hashtable;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.nio.charset.Charset;
-
-/**
- * A wrapper that invokes external command to do real parsing job.
- * 
- * @author John Xing
- */
-
-public class ExtParser implements Parser {
-
-  public static final Logger LOG = LoggerFactory
-      .getLogger("org.apache.nutch.parse.ext");
-
-  static final int BUFFER_SIZE = 4096;
-
-  static final int TIMEOUT_DEFAULT = 30; // in seconds
-
-  // handy map from String contentType to String[] {command, timeoutString,
-  // encoding}
-  Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>();
-
-  private Configuration conf;
-
-  public ExtParser() {
-  }
-
-  public ParseResult getParse(Content content) {
-
-    String contentType = content.getContentType();
-
-    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
-    if (params == null)
-      return new ParseStatus(ParseStatus.FAILED,
-          "No external command defined for contentType: " + contentType)
-          .getEmptyParseResult(content.getUrl(), getConf());
-
-    String command = params[0];
-    int timeout = Integer.parseInt(params[1]);
-    String encoding = params[2];
-
-    if (LOG.isTraceEnabled()) {
-      LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
-    }
-
-    String text = null;
-    String title = null;
-
-    try {
-
-      byte[] raw = content.getContent();
-
-      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
-      if (contentLength != null
-          && raw.length != Integer.parseInt(contentLength)) {
-        return new ParseStatus(ParseStatus.FAILED,
-            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
-                + " bytes. Parser can't handle incomplete " + contentType
-                + " file.").getEmptyParseResult(content.getUrl(), getConf());
-      }
-
-      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
-      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);
-
-      CommandRunner cr = new CommandRunner();
-
-      cr.setCommand(command + " " + contentType);
-      cr.setInputStream(new ByteArrayInputStream(raw));
-      cr.setStdOutputStream(os);
-      cr.setStdErrorStream(es);
-
-      cr.setTimeout(timeout);
-
-      cr.evaluate();
-
-      if (cr.getExitValue() != 0)
-        return new ParseStatus(ParseStatus.FAILED, "External command "
-            + command + " failed with error: " + es.toString())
-            .getEmptyParseResult(content.getUrl(), getConf());
-
-      text = os.toString(encoding);
-
-    } catch (Exception e) { // run time exception
-      return new ParseStatus(e)
-          .getEmptyParseResult(content.getUrl(), getConf());
-    }
-
-    if (text == null)
-      text = "";
-
-    if (title == null)
-      title = "";
-
-    // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
-
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
-        outlinks, content.getMetadata());
-    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
-        parseData));
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    Extension[] extensions = PluginRepository.get(conf)
-        .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
-
-    String contentType, command, timeoutString, encoding;
-
-    for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
-
-      // only look for extensions defined by plugin parse-ext
-      if (!extension.getDescriptor().getPluginId().equals("parse-ext"))
-        continue;
-
-      contentType = extension.getAttribute("contentType");
-      if (contentType == null || contentType.equals(""))
-        continue;
-
-      command = extension.getAttribute("command");
-      if (command == null || command.equals(""))
-        continue;
-
-      // null encoding means default
-      encoding = extension.getAttribute("encoding");
-      if (encoding == null)
-        encoding = Charset.defaultCharset().name();
-
-      timeoutString = extension.getAttribute("timeout");
-      if (timeoutString == null || timeoutString.equals(""))
-        timeoutString = "" + TIMEOUT_DEFAULT;
-
-      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString,
-          encoding });
-    }
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
deleted file mode 100644
index 6394489..0000000
--- a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse wrapper to run external command to do the parsing.
- */
-package org.apache.nutch.parse.ext;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java b/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
deleted file mode 100644
index a399273..0000000
--- a/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.ext;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-
-/**
- * Unit tests for ExtParser. First creates a temp file with fixed content, then
- * fetch and parse it using external command 'cat' and 'md5sum' alternately for
- * 10 times. Doing so also does a light stress test for class CommandRunner.java
- * (as used in ExtParser.java).
- * 
- * Warning: currently only do test on linux platform.
- * 
- * @author John Xing
- */
-public class TestExtParser {
-  private File tempFile = null;
-  private String urlString = null;
-  private Content content = null;
-  private Parse parse = null;
-
-  private String expectedText = "nutch rocks nutch rocks nutch rocks";
-  // echo -n "nutch rocks nutch rocks nutch rocks" | md5sum
-  private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526";
-
-  @Before
-  protected void setUp() throws ProtocolException, IOException {
-    // prepare a temp file with expectedText as its content
-    // This system property is defined in ./src/plugin/build-plugin.xml
-    String path = System.getProperty("test.data");
-    if (path != null) {
-      File tempDir = new File(path);
-      if (!tempDir.exists())
-        tempDir.mkdir();
-      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt",
-          tempDir);
-    } else {
-      // otherwise in java.io.tmpdir
-      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
-    }
-    urlString = tempFile.toURI().toURL().toString();
-
-    FileOutputStream fos = new FileOutputStream(tempFile);
-    fos.write(expectedText.getBytes());
-    fos.close();
-
-    // get nutch content
-    Protocol protocol = new ProtocolFactory(NutchConfiguration.create())
-        .getProtocol(urlString);
-    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
-        .getContent();
-    protocol = null;
-  }
-
-  @After
-  protected void tearDown() {
-    // clean content
-    content = null;
-
-    // clean temp file
-    // if (tempFile != null && tempFile.exists())
-    // tempFile.delete();
-  }
-
-  @Test
-  public void testIt() throws ParseException {
-    String contentType;
-
-    // now test only on linux platform
-    if (!System.getProperty("os.name").equalsIgnoreCase("linux")) {
-      System.err
-          .println("Current OS is " + System.getProperty("os.name") + ".");
-      System.err.println("No test is run on OS other than linux.");
-      return;
-    }
-
-    Configuration conf = NutchConfiguration.create();
-    // loop alternately, total 10*2 times of invoking external command
-    for (int i = 0; i < 10; i++) {
-      // check external parser that does 'cat'
-      contentType = "application/vnd.nutch.example.cat";
-      content.setContentType(contentType);
-      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
-          content.getUrl());
-      Assert.assertEquals(expectedText, parse.getText());
-
-      // check external parser that does 'md5sum'
-      contentType = "application/vnd.nutch.example.md5sum";
-      content.setContentType(contentType);
-      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
-          content.getUrl());
-      Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/build.xml b/src/plugin/parse-html/build.xml
deleted file mode 100755
index a5b99b5..0000000
--- a/src/plugin/parse-html/build.xml
+++ /dev/null
@@ -1,40 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-html" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Build compilation dependencies -->
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-nekohtml/*.jar" />
-    </fileset>
-  </path>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/ivy.xml b/src/plugin/parse-html/ivy.xml
deleted file mode 100644
index e8a6135..0000000
--- a/src/plugin/parse-html/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-   <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/>
-  </dependencies>
-
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/plugin.xml b/src/plugin/parse-html/plugin.xml
deleted file mode 100755
index 3be70c3..0000000
--- a/src/plugin/parse-html/plugin.xml
+++ /dev/null
@@ -1,48 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="parse-html"
-   name="Html Parse Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="parse-html.jar">
-         <export name="*"/>
-      </library>
-      <library name="tagsoup-1.2.1.jar"/>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-      <import plugin="lib-nekohtml"/>
-   </requires>
-
-   <extension id="org.apache.nutch.parse.html"
-              name="HtmlParse"
-              point="org.apache.nutch.parse.Parser">
-
-      <implementation id="org.apache.nutch.parse.html.HtmlParser"
-                      class="org.apache.nutch.parse.html.HtmlParser">
-        <parameter name="contentType" value="text/html|application/xhtml+xml"/>
-        <parameter name="pathSuffix" value=""/>
-      </implementation>
-
-   </extension>
-
-</plugin>