You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:23 UTC
[39/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
new file mode 100644
index 0000000..86692ae
--- /dev/null
+++ b/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.openqa.selenium.By;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
+import org.openqa.selenium.TimeoutException;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.firefox.FirefoxBinary;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.io.TemporaryFilesystem;
+import org.openqa.selenium.remote.DesiredCapabilities;
+import org.openqa.selenium.remote.RemoteWebDriver;
+import org.openqa.selenium.safari.SafariDriver;
+import org.openqa.selenium.phantomjs.PhantomJSDriver;
+import org.openqa.selenium.phantomjs.PhantomJSDriverService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.opera.core.systems.OperaDriver;
+
+public class HttpWebClient {
+
+ private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class);
+
+  /**
+   * Per-thread Firefox {@link WebDriver}. The driver is created the first
+   * time a thread reads this {@link ThreadLocal}, using a profile that sets
+   * the stylesheet and image permissions to 2, switches the Flash plugin
+   * preference off and sets the allowed-hosts preference to localhost.
+   */
+  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
+
+    @Override
+    protected WebDriver initialValue() {
+      FirefoxProfile profile = new FirefoxProfile();
+      // NOTE(review): 2 presumably means "block" for these permissions — confirm
+      profile.setPreference("permissions.default.stylesheet", 2);
+      profile.setPreference("permissions.default.image", 2);
+      // Pass a real boolean, as getDriverForPage does for the same preference;
+      // the String "false" would be written as a string-typed preference.
+      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", false);
+      profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost");
+      return new FirefoxDriver(profile);
+    }
+  };
+
+  /**
+   * Creates the {@link WebDriver} selected by the <code>selenium.driver</code>
+   * property (<code>firefox</code>, <code>chrome</code>, <code>safari</code>,
+   * <code>opera</code>, <code>phantomjs</code> or <code>remote</code>; default
+   * <code>firefox</code>) and loads the given URL with it. An unknown driver
+   * type is logged as an error and a plain {@link FirefoxDriver} is used.
+   * The page load timeout is taken from <code>page.load.delay</code>
+   * (seconds, default 3).
+   *
+   * @param url the URL to load
+   * @param conf the configuration the driver settings are read from; must
+   *          not be null
+   * @return the driver after loading the page; when the load timed out the
+   *         driver is returned as is, holding whatever was loaded so far
+   * @throws RuntimeException wrapping any other failure; the driver is
+   *           cleaned up before the exception is thrown
+   */
+  public static WebDriver getDriverForPage(String url, Configuration conf) {
+    WebDriver driver = null;
+    long pageLoadWait = conf.getLong("page.load.delay", 3);
+
+    try {
+      String driverType = conf.get("selenium.driver", "firefox");
+      switch (driverType) {
+      case "firefox":
+        driver = createFirefoxDriver(conf);
+        break;
+      case "chrome":
+        driver = new ChromeDriver();
+        break;
+      case "safari":
+        driver = new SafariDriver();
+        break;
+      case "opera":
+        driver = new OperaDriver();
+        break;
+      case "phantomjs":
+        driver = new PhantomJSDriver();
+        break;
+      case "remote":
+        driver = createRemoteDriver(conf);
+        break;
+      default:
+        LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
+        driver = new FirefoxDriver();
+        break;
+      }
+      LOG.debug("Selenium {} WebDriver selected.", driverType);
+
+      driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
+      driver.get(url);
+    } catch (TimeoutException e) {
+      // A timeout is not fatal: the caller works with the partially loaded page
+      LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+      return driver;
+    } catch (Exception e) {
+      try {
+        cleanUpDriver(driver);
+      } catch (RuntimeException cleanupFailure) {
+        // Keep the original failure as the cause; do not let cleanup mask it
+        e.addSuppressed(cleanupFailure);
+      }
+      throw new RuntimeException(e);
+    }
+
+    return driver;
+  }
+
+  /** Builds a local FirefoxDriver from the <code>selenium.firefox.*</code> properties. */
+  private static WebDriver createFirefoxDriver(Configuration conf) {
+    String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost");
+    long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45);
+    boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false);
+    int loadImage = conf.getInt("selenium.firefox.load.image", 1);
+    int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1);
+
+    FirefoxProfile profile = new FirefoxProfile();
+    profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
+    profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
+    profile.setPreference("permissions.default.stylesheet", loadStylesheet);
+    profile.setPreference("permissions.default.image", loadImage);
+
+    FirefoxBinary binary = new FirefoxBinary();
+    // The timeout is configured in seconds; setTimeout is given milliseconds
+    binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
+    return new FirefoxDriver(binary, profile);
+  }
+
+  /** Builds a RemoteWebDriver for the hub described by the <code>selenium.hub.*</code> and <code>selenium.grid.*</code> properties. */
+  private static WebDriver createRemoteDriver(Configuration conf)
+      throws java.net.MalformedURLException {
+    String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
+    int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
+    String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
+    String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
+    String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox");
+    String seleniumGridBinary = conf.get("selenium.grid.binary");
+    URL hubUrl = new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath);
+
+    DesiredCapabilities capabilities;
+    switch (seleniumGridDriver) {
+    case "firefox":
+      capabilities = DesiredCapabilities.firefox();
+      capabilities.setBrowserName("firefox");
+      capabilities.setJavascriptEnabled(true);
+      capabilities.setCapability("firefox_binary", seleniumGridBinary);
+      System.setProperty("webdriver.reap_profile", "false");
+      break;
+    case "phantomjs":
+      capabilities = DesiredCapabilities.phantomjs();
+      capabilities.setBrowserName("phantomjs");
+      capabilities.setJavascriptEnabled(true);
+      capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, seleniumGridBinary);
+      break;
+    default:
+      // Report the grid driver that was actually requested, not the outer "remote" type
+      LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", seleniumGridDriver);
+      capabilities = DesiredCapabilities.firefox();
+      break;
+    }
+    return new RemoteWebDriver(hubUrl, capabilities);
+  }
+
+ public static String getHTMLContent(WebDriver driver, Configuration conf) {
+ if (conf.getBoolean("take.screenshot", false)) {
+ takeScreenshot(driver, conf);
+ }
+
+ return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+ }
+
+  /**
+   * Closes and quits the given driver and deletes Selenium's temporary
+   * files. Each step runs even when an earlier one fails, so a failing
+   * <code>close()</code> no longer prevents <code>quit()</code> or the
+   * temporary file cleanup.
+   *
+   * @param driver the driver to shut down; null is ignored
+   * @throws RuntimeException wrapping a failure of any of the cleanup steps
+   */
+  public static void cleanUpDriver(WebDriver driver) {
+    if (driver == null) {
+      return;
+    }
+    try {
+      try {
+        driver.close();
+      } finally {
+        try {
+          driver.quit();
+        } finally {
+          TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+        }
+      }
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Fetches and renders a page with the configured
+   * {@link org.openqa.selenium.WebDriver} and returns the inner HTML of its
+   * BODY element. When <code>take.screenshot</code> is enabled in the
+   * configuration, a screenshot of the rendered page is taken before the
+   * HTML is read. The driver is always cleaned up before this method
+   * returns or throws.
+   *
+   * @param url the URL to fetch and render
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the rendered inner HTML page
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    final WebDriver driver = getDriverForPage(url, conf);
+
+    try {
+      // Optional screenshot plus extraction of the body's inner HTML
+      return getHTMLContent(driver, conf);
+    } catch (Exception e) {
+      // Broad catch borrowed from lib-htmlunit; acknowledged as a code smell
+      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      throw new RuntimeException(e);
+    } finally {
+      cleanUpDriver(driver);
+    }
+  }
+
+  /**
+   * Convenience overload of {@link #getHtmlPage(String, Configuration)}
+   * that uses a freshly created {@link Configuration}, so every setting
+   * falls back to the default coded at its point of use unless the default
+   * configuration resources define it.
+   * NOTE(review): a plain Hadoop Configuration presumably does not load
+   * nutch-site.xml — confirm whether a Nutch configuration should be used.
+   *
+   * @param url the URL to fetch and render
+   * @return the rendered inner HTML page
+   */
+  public static String getHtmlPage(String url) {
+    // A null configuration cannot work: getDriverForPage dereferences it at once
+    return getHtmlPage(url, new Configuration());
+  }
+
+  /**
+   * Takes a screenshot of the page held by the driver and copies it to
+   * <code>screenshot.location</code> on the configured {@link FileSystem},
+   * under the name of the screenshot file produced by the driver. Nothing
+   * is copied when <code>screenshot.location</code> is unset or when a file
+   * of that name already exists at the target; both cases are logged.
+   *
+   * @param driver the driver holding the rendered page
+   * @param conf the configuration providing <code>screenshot.location</code>
+   *          and the file system settings
+   * @throws RuntimeException wrapping any failure; the driver is cleaned up
+   *           before the exception is thrown
+   */
+  private static void takeScreenshot(WebDriver driver, Configuration conf) {
+    try {
+      String url = driver.getCurrentUrl();
+      File srcFile = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
+      LOG.debug("In-memory screenshot taken of: {}", url);
+
+      String screenshotLocation = conf.get("screenshot.location");
+      if (screenshotLocation == null) {
+        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
+            + "'screenshot.location' is absent from nutch-site.xml.", url);
+        return;
+      }
+
+      FileSystem fs = FileSystem.get(conf);
+      Path screenshotPath = new Path(screenshotLocation + "/" + srcFile.getName());
+      if (fs.exists(screenshotPath)) {
+        // Previously this case ran into a null output stream; skip instead of failing
+        LOG.warn("Screenshot for {} not saved as {} already exists.", url, screenshotPath);
+        return;
+      }
+
+      LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
+      // try-with-resources closes both streams, so copyBytes must not close them itself
+      try (InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
+          OutputStream os = fs.create(screenshotPath)) {
+        IOUtils.copyBytes(is, os, conf, false);
+      }
+      LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName());
+    } catch (Exception e) {
+      cleanUpDriver(driver);
+      throw new RuntimeException(e);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-xml/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/build.xml b/nutch-plugins/lib-xml/build.xml
new file mode 100644
index 0000000..0f87c07
--- /dev/null
+++ b/nutch-plugins/lib-xml/build.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-xml" default="jar">
+
+ <import file="../build-plugin.xml" />
+
+ <!--
+ ! Override the compile and jar targets,
+ ! since there is nothing to compile here.
+ ! -->
+ <target name="compile" depends="init, resolve-default" />
+
+ <!--
+ <target name="jar" depends="compile">
+ <copy todir="${build.dir}" verbose="true">
+ <fileset dir="./lib" includes="**/*.jar" />
+ </copy>
+ </target>
+ -->
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-xml/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/ivy.xml b/nutch-plugins/lib-xml/ivy.xml
new file mode 100644
index 0000000..414f38a
--- /dev/null
+++ b/nutch-plugins/lib-xml/ivy.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default"/>
+ <dependency org="jaxen" name="jaxen" rev="1.1.1" conf="*->master"/>
+ <dependency org="xerces" name="xercesImpl" rev="2.11.0" conf="*->master"/>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-xml/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/plugin.xml b/nutch-plugins/lib-xml/plugin.xml
new file mode 100644
index 0000000..79bd17f
--- /dev/null
+++ b/nutch-plugins/lib-xml/plugin.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! XML library - Gathers many XML related libraries:
+ !
+ ! * Jaxen
+ ! - Download : http://jaxen.org/releases.html
+ ! - License : http://jaxen.org/license.html
+ !
+ ! * Xerces-J 2.11.0
+ ! - Download : http://xerces.apache.org/xerces2-j/download.cgi
+ ! - License : http://www.apache.org/licenses/LICENSE-2.0
+ !
+ ! * SAXPath 1.0 FCS
+ ! - Note : SAXPath has been incorporated into Jaxen.
+ ! It has been merged into the Jaxen codebase
+ ! and is no longer being maintained separately
+ ! - Download : http://sourceforge.net/project/showfiles.php?group_id=26014
+ ! - License : OSI-Approved Open Source
+ !
+ ! * jdom 1.1
+ ! - Download : http://www.jdom.org/downloads/index.html
+ ! - License : http://www.jdom.org/docs/faq.html#a0030
+ !
+ !-->
+<plugin
+ id="lib-xml"
+ name="XML Libraries"
+ version="1.0"
+ provider-name="org.apache.nutch.xml">
+
+ <runtime>
+ <library name="jaxen-core.jar">
+ <export name="*"/>
+ </library>
+ <library name="jaxen-jdom.jar">
+ <export name="*"/>
+ </library>
+ <library name="xercesImpl.jar">
+ <export name="*"/>
+ </library>
+ <library name="saxpath.jar">
+ <export name="*"/>
+ </library>
+ <library name="jdom.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-xml/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/lib-xml/pom.xml b/nutch-plugins/lib-xml/pom.xml
new file mode 100644
index 0000000..132d0f2
--- /dev/null
+++ b/nutch-plugins/lib-xml/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>lib-xml</artifactId>
+ <packaging>jar</packaging>
+
+ <name>lib-xml</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/build.xml b/nutch-plugins/microformats-reltag/build.xml
new file mode 100644
index 0000000..395afee
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="microformats-reltag" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/ivy.xml b/nutch-plugins/microformats-reltag/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/plugin.xml b/nutch-plugins/microformats-reltag/plugin.xml
new file mode 100644
index 0000000..b35e1f4
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/plugin.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="microformats-reltag"
+ name="Rel-Tag microformat Parser/Indexer/Querier"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="microformats-reltag.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.microformats.reltag.RelTagParser"
+ name="Rel-Tag parser"
+ point="org.apache.nutch.parse.HtmlParseFilter">
+ <implementation id="RelTagParser"
+ class="org.apache.nutch.microformats.reltag.RelTagParser"/>
+ </extension>
+
+ <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"
+ name="Rel-Tag indexing filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="RelTagIndexingFilter"
+ class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
+ </extension>
+
+</plugin>
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/pom.xml b/nutch-plugins/microformats-reltag/pom.xml
new file mode 100644
index 0000000..8579cb5
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>microformats-reltag</artifactId>
+ <packaging>jar</packaging>
+
+ <name>microformats-reltag</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
new file mode 100644
index 0000000..e50a150
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.parse.Parse;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds <code>tag</code>
+ * field(s) to the document.
+ *
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ *      http://www.microformats.org/wiki/rel-tag</a>
+ * @author Jérôme Charron
+ */
+public class RelTagIndexingFilter implements IndexingFilter {
+
+  private Configuration conf;
+
+  /**
+   * Copies every value stored under {@link RelTagParser#REL_TAG} in the
+   * parse metadata into a <code>tag</code> field of the document.
+   *
+   * @return the same document instance that was passed in
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // Rel-Tags, if any, were possibly put into the metadata by RelTagParser
+    String[] relTags = parse.getData().getParseMeta()
+        .getValues(RelTagParser.REL_TAG);
+    if (relTags == null) {
+      return doc;
+    }
+
+    for (String relTag : relTags) {
+      doc.add("tag", relTag);
+    }
+    return doc;
+  }
+
+  /* ---- Configurable implementation ---- */
+
+  /** Stores the configuration for later retrieval through {@link #getConf()}. */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /** Returns the configuration last passed to {@link #setConf(Configuration)}. */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
new file mode 100644
index 0000000..9176a1e
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// JDK imports
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeSet;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+// SLF4J imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.StringUtil;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Adds microformat rel-tags of document if found.
+ *
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ * http://www.microformats.org/wiki/rel-tag</a>
+ */
+public class RelTagParser implements HtmlParseFilter {
+
+ public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class);
+
+ public final static String REL_TAG = "Rel-Tag";
+
+ private Configuration conf = null;
+
+  /**
+   * Scans the HTML document for rel-tags and adds each tag found to the
+   * parse metadata of the content's URL under {@link #REL_TAG}.
+   *
+   * @return the parse result that was passed in
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // Parse entry belonging to this content
+    Parse parse = parseResult.get(content.getUrl());
+    // Collect the document's rel-tags
+    Set<String> relTags = new Parser(doc).getRelTags();
+    Metadata parseMeta = parse.getData().getParseMeta();
+    for (String relTag : relTags) {
+      parseMeta.add(REL_TAG, relTag);
+    }
+
+    return parseResult;
+  }
+
+  /**
+   * Walks a DOM tree and collects the tags of all <code>&lt;a&gt;</code>
+   * elements that carry a <code>href</code> attribute and a <code>rel</code>
+   * attribute containing the <code>tag</code> token. The tag is the decoded
+   * last path segment of the link's URL.
+   */
+  private static class Parser {
+
+    Set<String> tags = null;
+
+    /** Parses the tree rooted at the given node straight away. */
+    Parser(Node node) {
+      tags = new TreeSet<String>();
+      parse(node);
+    }
+
+    /** Returns the (sorted, duplicate-free) tags collected so far. */
+    Set<String> getRelTags() {
+      return tags;
+    }
+
+    /** Inspects the node itself, then recurses into all of its children. */
+    void parse(Node node) {
+
+      if (node.getNodeType() == Node.ELEMENT_NODE
+          && "a".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node hrefNode = attrs.getNamedItem("href");
+        Node relNode = attrs.getNamedItem("rel");
+        // Both href and rel must be present, and rel must list "tag"
+        if (hrefNode != null && relNode != null
+            && hasTagToken(relNode.getNodeValue())) {
+          String tag = parseTag(hrefNode.getNodeValue());
+          // Set.add reports whether the tag was new, so no separate lookup
+          if (!StringUtil.isEmpty(tag) && tags.add(tag)) {
+            LOG.debug("Adding tag: {} to tag set.", tag);
+          }
+        }
+      }
+
+      // Recurse
+      NodeList children = node.getChildNodes();
+      for (int i = 0; children != null && i < children.getLength(); i++) {
+        parse(children.item(i));
+      }
+    }
+
+    /**
+     * Tells whether a rel attribute value contains the <code>tag</code>
+     * token. The value is treated as a whitespace-separated list, so
+     * <code>rel="tag nofollow"</code> matches as well as
+     * <code>rel="tag"</code>.
+     */
+    private static boolean hasTagToken(String rel) {
+      if (rel == null) {
+        return false;
+      }
+      for (String token : rel.trim().split("\\s+")) {
+        if ("tag".equalsIgnoreCase(token)) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    /**
+     * Extracts the tag from a link target: the text after the last '/' of
+     * the URL's path, URL-decoded as UTF-8. Returns null when the value is
+     * not a parseable absolute URL or cannot be decoded.
+     */
+    private final static String parseTag(String url) {
+      String tag = null;
+      try {
+        URL u = new URL(url);
+        String path = u.getPath();
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
+            "UTF-8");
+      } catch (Exception e) {
+        // Malformed tag...
+        tag = null;
+      }
+      return tag;
+    }
+
+  }
+
+ public void setConf(Configuration conf) { // stores the given configuration on this filter
+ this.conf = conf;
+ }
+
+ public Configuration getConf() { // returns the configuration last passed to setConf (null until then)
+ return this.conf;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
new file mode 100644
index 0000000..bef5409
--- /dev/null
+++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html
@@ -0,0 +1,8 @@
+<html>
+<body>
+<p>
+A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a>
+Parser/Indexer/Querier plugin.
+</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/build.xml b/nutch-plugins/mimetype-filter/build.xml
new file mode 100644
index 0000000..977e643
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="mimetype-filter" default="jar-core">
+
+ <import file="../build-plugin.xml" />
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample" includes="**/*.txt"/>
+ </copy>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/ivy.xml b/nutch-plugins/mimetype-filter/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/plugin.xml b/nutch-plugins/mimetype-filter/plugin.xml
new file mode 100644
index 0000000..d038447
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/plugin.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="mimetype-filter"
+ name="Filter indexed documents by the detected MIME"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="mimetype-filter.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <extension id="org.apache.nutch.indexer.filter"
+ name="Nutch MIME filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="MimeTypeIndexingFilter"
+ class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/>
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/pom.xml b/nutch-plugins/mimetype-filter/pom.xml
new file mode 100644
index 0000000..29c0798
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>mimetype-filter</artifactId>
+ <packaging>jar</packaging>
+
+ <name>mimetype-filter</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/sample/allow-images.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/sample/allow-images.txt b/nutch-plugins/mimetype-filter/sample/allow-images.txt
new file mode 100644
index 0000000..0f5f136
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/sample/allow-images.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype per line, with no preceding
+# whitespace. The order in which mimetypes are specified doesn't matter. Blank
+# lines and comments (#) are allowed.
+#
+
+-
+
+image
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/sample/block-html.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/sample/block-html.txt b/nutch-plugins/mimetype-filter/sample/block-html.txt
new file mode 100644
index 0000000..69600ec
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/sample/block-html.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This filter can be configured to work in one of two modes (similar to
+# suffix-url-filter)
+
+# default to reject ('-'): in this mode, only documents with a mimetype that
+# match the ones specified in the config file will be accepted, all other
+# mimetypes will be rejected.
+
+# default to accept ('+'): in this mode, only documents with a mimetype
+# that match the ones specified in the config file will be rejected,
+# all other mimetypes will be accepted.
+
+# The format of this config file is one mimetype per line, with no preceding
+# whitespace. The order in which mimetypes are specified doesn't matter. Blank
+# lines and comments (#) are allowed.
+#
+
++
+
+text/html
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
new file mode 100644
index 0000000..494d888
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.UnrecognizedOptionException;
+
+// Nutch imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.PrefixStringMatcher;
+import org.apache.nutch.util.TrieStringMatcher;
+import org.apache.tika.Tika;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
+ * of documents based on the MIME Type detected by Tika
+ *
+ */
+public class MimeTypeIndexingFilter implements IndexingFilter {
+
+ public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MimeTypeIndexingFilter.class);
+
+ private MimeUtil MIME;
+ private Tika tika = new Tika();
+
+ private TrieStringMatcher trie;
+
+ private Configuration conf;
+
+ private boolean acceptMode = true;
+
+ // Inherited JavaDoc
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+ String mimeType;
+ String contentType;
+
+ Writable tcontentType = datum.getMetaData()
+ .get(new Text(Response.CONTENT_TYPE));
+
+ if (tcontentType != null) {
+ contentType = tcontentType.toString();
+ } else {
+ contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
+ }
+
+ if (contentType == null) {
+ mimeType = tika.detect(url.toString());
+ } else {
+ mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
+ }
+
+ contentType = mimeType;
+
+ if (LOG.isInfoEnabled()) {
+ LOG.info(String.format("[%s] %s", contentType, url));
+ }
+
+ if (trie != null) {
+ if (trie.shortestMatch(contentType) == null) {
+ // no match, but
+ if (acceptMode) {
+ return doc;
+ }
+ return null;
+ } else {
+ // matched, but we are blocking
+ if (acceptMode) {
+ return null;
+ }
+ }
+ }
+
+ return doc;
+ }
+
+ /*
+ * -----------------------------
+ * <implementation:Configurable> *
+ * -----------------------------
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ MIME = new MimeUtil(conf);
+
+ // load the file of the values
+ String file = conf.get(MIMEFILTER_REGEX_FILE, "");
+
+ if (file != null) {
+ if (file.isEmpty()) {
+ LOG.warn(String
+ .format("Missing %s property, ALL mimetypes will be allowed",
+ MIMEFILTER_REGEX_FILE));
+ } else {
+ Reader reader = conf.getConfResourceAsReader(file);
+
+ try {
+ readConfiguration(reader);
+ } catch (IOException e) {
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
+
+ throw new RuntimeException(e.getMessage(), e);
+ }
+ }
+ }
+ }
+
+ private void readConfiguration(Reader reader) throws IOException {
+ BufferedReader in = new BufferedReader(reader);
+ String line;
+ List rules = new ArrayList();
+
+ while (null != (line = in.readLine())) {
+ if (line.length() == 0) {
+ continue;
+ }
+
+ char first = line.charAt(0);
+ switch (first) {
+ case ' ':
+ case '\n':
+ case '#': // skip blank & comment lines
+ break;
+ case '+':
+ acceptMode = true;
+ break;
+ case '-':
+ acceptMode = false;
+ break;
+ default:
+ rules.add(line);
+ break;
+ }
+ }
+
+ trie = new PrefixStringMatcher(rules);
+ }
+
+ @Override
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /**
+ * Main method for invoking this tool
+ *
+ * @throws IOException, IndexingException
+ */
+ public static void main(String[] args) throws IOException, IndexingException {
+ Option helpOpt = new Option("h", "help", false, "show this help message");
+ Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
+ .withDescription(
+ "Rules file to be used in the tests relative to the conf directory")
+ .isRequired().create("rules");
+
+ Options options = new Options();
+ options.addOption(helpOpt).addOption(rulesOpt);
+
+ CommandLineParser parser = new GnuParser();
+ HelpFormatter formatter = new HelpFormatter();
+ String rulesFile;
+
+ try {
+ CommandLine line = parser.parse(options, args);
+
+ if (line.hasOption("help") || !line.hasOption("rules")) {
+ formatter
+ .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
+ options, true);
+ return;
+ }
+
+ rulesFile = line.getOptionValue("rules");
+ } catch (UnrecognizedOptionException e) {
+ formatter
+ .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
+ options, true);
+ return;
+ } catch (Exception e) {
+ LOG.error(StringUtils.stringifyException(e));
+ e.printStackTrace();
+ return;
+ }
+
+ MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+ Configuration conf = NutchConfiguration.create();
+ conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
+ filter.setConf(conf);
+
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ String line;
+
+ while ((line = in.readLine()) != null && !line.isEmpty()) {
+ Metadata metadata = new Metadata();
+ metadata.set(Response.CONTENT_TYPE, line);
+ ParseImpl parse = new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+
+ NutchDocument doc = filter.filter(new NutchDocument(), parse,
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+
+ if (doc != null) {
+ System.out.print("+ ");
+ System.out.println(line);
+ } else {
+ System.out.print("- ");
+ System.out.println(line);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java b/nutch-plugins/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
new file mode 100644
index 0000000..bca230f
--- /dev/null
+++ b/nutch-plugins/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.filter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter}
+ * <p>
+ * Every test runs the filter over one parse per entry of
+ * {@link #MIME_TYPES} and checks which documents are kept.
+ */
+public class MimeTypeIndexingFilterTest {
+
+  private static final String[] MIME_TYPES = { "text/html", "image/png",
+      "application/pdf" };
+
+  private Configuration conf = NutchConfiguration.create();
+  private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
+  private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length];
+
+  /** Builds one parse per MIME type, carrying it as its content type. */
+  @Before
+  public void setUp() throws Exception {
+    for (int i = 0; i < MIME_TYPES.length; i++) {
+      Metadata metadata = new Metadata();
+      metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]);
+
+      parses[i] = new ParseImpl("text",
+          new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
+    }
+  }
+
+  /**
+   * Runs the filter under test on a fresh document for the given parse.
+   *
+   * @return the filter result: the document if kept, null if dropped
+   */
+  private NutchDocument applyFilter(ParseImpl parse) throws Exception {
+    return filter.filter(new NutchDocument(), parse,
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+  }
+
+  @Test
+  public void testMissingConfigFile() throws Exception {
+    String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
+    Assert.assertEquals(String
+        .format("Property %s must not be present in the configuration file",
+            MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
+
+    filter.setConf(conf);
+
+    // property not set so in this case all documents must pass the filter
+    for (int i = 0; i < parses.length; i++) {
+      Assert.assertNotNull("All documents must be allowed by default",
+          applyFilter(parses[i]));
+    }
+  }
+
+  @Test
+  public void testAllowOnlyImages() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = applyFilter(parses[i]);
+
+      if (MIME_TYPES[i].contains("image")) {
+        Assert.assertNotNull("Allow only images", doc);
+      } else {
+        Assert.assertNull("Block everything else", doc);
+      }
+    }
+  }
+
+  @Test
+  public void testBlockHTML() throws Exception {
+    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
+    filter.setConf(conf);
+
+    for (int i = 0; i < parses.length; i++) {
+      NutchDocument doc = applyFilter(parses[i]);
+
+      if (MIME_TYPES[i].contains("html")) {
+        Assert.assertNull("Block only HTML documents", doc);
+      } else {
+        Assert.assertNotNull("Allow everything else", doc);
+      }
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/nutch-extensionpoints/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/build.xml b/nutch-plugins/nutch-extensionpoints/build.xml
new file mode 100644
index 0000000..45eb815
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/build.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="nutch-extensionpoints" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+ <!--
+ ! Override the compile and jar targets,
+ ! since there is nothing to compile here.
+ ! -->
+ <target name="compile" depends="init, resolve-default"/>
+
+ <!--target name="jar" depends="compile"/-->
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/nutch-extensionpoints/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/ivy.xml b/nutch-plugins/nutch-extensionpoints/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/nutch-extensionpoints/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/plugin.xml b/nutch-plugins/nutch-extensionpoints/plugin.xml
new file mode 100644
index 0000000..8cf7a23
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/plugin.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="nutch-extensionpoints"
+ name="the nutch core extension points"
+ version="2.0.0"
+ provider-name="nutch.org">
+
+ <!-- this file hosts all extension points nutch core code offers.
+ Please note that plugins can define extension points as well to be extendable.-->
+
+<extension-point
+ id="org.apache.nutch.indexer.IndexingFilter"
+ name="Nutch Indexing Filter"/>
+
+<extension-point
+ id="org.apache.nutch.indexer.IndexWriter"
+ name="Nutch Index Writer"/>
+
+<extension-point
+ id="org.apache.nutch.parse.Parser"
+ name="Nutch Content Parser"/>
+
+<extension-point
+ id="org.apache.nutch.parse.HtmlParseFilter"
+ name="HTML Parse Filter"/>
+
+<extension-point
+ id="org.apache.nutch.protocol.Protocol"
+ name="Nutch Protocol"/>
+
+<extension-point
+ id="org.apache.nutch.net.URLFilter"
+ name="Nutch URL Filter"/>
+
+<extension-point
+ id="org.apache.nutch.net.URLExemptionFilter"
+ name="Nutch URL Ignore Exemption Filter"/>
+
+<extension-point
+ id="org.apache.nutch.net.URLNormalizer"
+ name="Nutch URL Normalizer"/>
+
+<extension-point
+ id="org.apache.nutch.scoring.ScoringFilter"
+ name="Nutch Scoring"/>
+
+<extension-point
+ id="org.apache.nutch.segment.SegmentMergeFilter"
+ name="Nutch Segment Merge Filter"/>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/nutch-extensionpoints/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/nutch-extensionpoints/pom.xml b/nutch-plugins/nutch-extensionpoints/pom.xml
new file mode 100644
index 0000000..db76178
--- /dev/null
+++ b/nutch-plugins/nutch-extensionpoints/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>nutch-extensionpoints</artifactId>
+ <packaging>jar</packaging>
+
+ <name>nutch-extensionpoints</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/build.xml b/nutch-plugins/parse-ext/build.xml
new file mode 100644
index 0000000..25552fa
--- /dev/null
+++ b/nutch-plugins/parse-ext/build.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-ext" default="jar-core">
+
+ <!-- Shared plugin build logic. NOTE(review): the default target jar-core
+ and the deploy.dir property are not defined in this file and are
+ presumably provided by this import; confirm. -->
+ <import file="../build-plugin.xml"/>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+ </target>
+
+
+ <!-- Copy the sample "command" script next to the deployed plugin and make
+ it executable (mode 755). NOTE(review): these two tasks sit outside any
+ target, so Ant should run them on every invocation of this build file
+ rather than only for a specific target; confirm this is intended. -->
+ <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/>
+ <chmod file="${deploy.dir}/command" perm="755"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/command
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/command b/nutch-plugins/parse-ext/command
new file mode 100644
index 0000000..f42c055
--- /dev/null
+++ b/nutch-plugins/parse-ext/command
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Example external command for the parse-ext plugin.
+#
+# Expects exactly one argument, the MIME type of the document. The chosen
+# tool (cat or md5sum) is started without arguments, so it processes
+# whatever arrives on standard input and prints to standard output.
+#
+# 20040701, John Xing
+
+# Abort as soon as any command fails.
+set -e
+
+# Exactly one argument is required.
+if [[ $# -ne 1 ]]; then
+  echo Usage:$0 mimeType >&2
+  exit 1
+fi
+
+mime_type=$1
+
+# Dispatch on the MIME type; anything unknown is reported on stderr.
+if [[ "$mime_type" == "application/vnd.nutch.example.cat" ]]; then
+  cat
+elif [[ "$mime_type" == "application/vnd.nutch.example.md5sum" ]]; then
+  md5sum
+else
+  echo "Can't parse mimeType $mime_type" >&2
+  exit 1
+fi
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/ivy.xml b/nutch-plugins/parse-ext/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-ext/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <!-- Ivy module descriptor; the module name is taken from the name of the
+ Ant project that resolves it. -->
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <!-- NOTE(review): this path contains a doubled slash and climbs three
+ directories. This file now lives in nutch-plugins/parse-ext, so verify
+ that the path still resolves to ivy-configurations.xml after the Maven
+ re-arrangement. -->
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <!-- This plugin declares no Ivy dependencies of its own. -->
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/plugin.xml b/nutch-plugins/parse-ext/plugin.xml
new file mode 100644
index 0000000..6819b36
--- /dev/null
+++ b/nutch-plugins/parse-ext/plugin.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="parse-ext"
+ name="External Parser Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <!-- Plugin descriptor for parse-ext: registers ExtParser on the Parser
+ extension point, once per supported content type. -->
+
+ <runtime>
+ <library name="parse-ext.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.parse.ext"
+ name="ExtParse"
+ point="org.apache.nutch.parse.Parser">
+
+ <!-- ExtParser.setConf() reads the contentType, command, timeout and
+ encoding attributes. The timeout is in seconds and ExtParser falls back
+ to 30 when it is missing.
+ NOTE(review): the command path is relative, so it presumably depends on
+ the working directory of the running process; confirm.
+ NOTE(review): both implementations below use the same id "ExtParser";
+ confirm that duplicate ids are intended. -->
+ <implementation id="ExtParser"
+ class="org.apache.nutch.parse.ext.ExtParser">
+ <parameter name="contentType" value="application/vnd.nutch.example.cat"/>
+ <parameter name="pathSuffix" value=""/>
+ <parameter name="command" value="./build/plugins/parse-ext/command"/>
+ <parameter name="timeout" value="10"/>
+ <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
+ <!-- <parameter name="encoding" value="UTF-8"/> -->
+ </implementation>
+
+ <implementation id="ExtParser"
+ class="org.apache.nutch.parse.ext.ExtParser">
+ <parameter name="contentType" value="application/vnd.nutch.example.md5sum"/>
+ <parameter name="pathSuffix" value=""/>
+ <parameter name="command" value="./build/plugins/parse-ext/command"/>
+ <parameter name="timeout" value="20"/>
+ <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
+ <!-- <parameter name="encoding" value="UTF-8"/> -->
+ </implementation>
+
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/pom.xml b/nutch-plugins/parse-ext/pom.xml
new file mode 100644
index 0000000..5a7b7be
--- /dev/null
+++ b/nutch-plugins/parse-ext/pom.xml
@@ -0,0 +1,38 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <!-- Maven module descriptor for the parse-ext plugin. -->
+ <modelVersion>4.0.0</modelVersion>
+
+ <!-- No groupId or version is declared below, so both are inherited from
+ this nutch-plugins parent POM. -->
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>parse-ext</artifactId>
+ <packaging>jar</packaging>
+
+ <name>parse-ext</name>
+ <url>http://nutch.apache.org</url>
+
+ <!-- Sources are read as UTF-8. This module declares no dependencies or
+ build plugins of its own. -->
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
new file mode 100644
index 0000000..94d9b32
--- /dev/null
+++ b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java
@@ -0,0 +1,183 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.ext;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.OutlinkExtractor;
+
+import org.apache.nutch.util.CommandRunner;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.Hashtable;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.charset.Charset;
+
+/**
+ * A {@link Parser} that hands the real parsing job to an external command.
+ *
+ * <p>
+ * {@link #setConf(Configuration)} collects, for every extension contributed
+ * by the <code>parse-ext</code> plugin, the <code>contentType</code>,
+ * <code>command</code>, <code>timeout</code> and <code>encoding</code>
+ * attributes. {@link #getParse(Content)} then looks up the command for the
+ * content type at hand, runs it through {@link CommandRunner} with the raw
+ * content as its input stream, and uses the captured standard output as the
+ * parse text.
+ * </p>
+ *
+ * @author John Xing
+ */
+
+public class ExtParser implements Parser {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.ext");
+
+  /** Initial capacity, in bytes, of the buffer capturing standard output. */
+  static final int BUFFER_SIZE = 4096;
+
+  /** Timeout, in seconds, used when an extension does not declare one. */
+  static final int TIMEOUT_DEFAULT = 30;
+
+  // handy map from String contentType to String[] {command, timeoutString,
+  // encoding}
+  Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>();
+
+  private Configuration conf;
+
+  public ExtParser() {
+  }
+
+  /**
+   * Parses the given content by running the external command registered for
+   * its content type.
+   *
+   * @param content
+   *          the fetched content to parse
+   * @return a successful parse whose text is the command's standard output
+   *         and whose title is empty; or an empty parse result carrying a
+   *         failed {@link ParseStatus} when no command is registered for the
+   *         content type, the content length does not match the
+   *         <code>Content-Length</code> metadata, the command exits with a
+   *         non-zero value, or any exception is raised while running it
+   */
+  @Override
+  public ParseResult getParse(Content content) {
+
+    String contentType = content.getContentType();
+
+    // Hashtable rejects null keys, so a missing content type must not reach
+    // get(); it is reported like any other unsupported type.
+    String[] params = (contentType == null) ? null
+        : TYPE_PARAMS_MAP.get(contentType);
+    if (params == null) {
+      return new ParseStatus(ParseStatus.FAILED,
+          "No external command defined for contentType: " + contentType)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    String command = params[0];
+    int timeout = parseTimeout(params[1], contentType);
+    String encoding = params[2];
+
+    LOG.trace("Use {} with timeout={}secs", command, timeout);
+
+    String text;
+
+    try {
+
+      byte[] raw = content.getContent();
+
+      // Refuse content whose size disagrees with the announced length.
+      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
+      if (contentLength != null
+          && raw.length != Integer.parseInt(contentLength)) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+                + " bytes. Parser can't handle incomplete " + contentType
+                + " file.").getEmptyParseResult(content.getUrl(), getConf());
+      }
+
+      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
+      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);
+
+      CommandRunner cr = new CommandRunner();
+
+      // The content type is appended to the configured command line.
+      cr.setCommand(command + " " + contentType);
+      cr.setInputStream(new ByteArrayInputStream(raw));
+      cr.setStdOutputStream(os);
+      cr.setStdErrorStream(es);
+
+      cr.setTimeout(timeout);
+
+      cr.evaluate();
+
+      if (cr.getExitValue() != 0) {
+        return new ParseStatus(ParseStatus.FAILED, "External command "
+            + command + " failed with error: " + es.toString())
+            .getEmptyParseResult(content.getUrl(), getConf());
+      }
+
+      text = os.toString(encoding);
+
+    } catch (Exception e) { // run time exception
+      return new ParseStatus(e)
+          .getEmptyParseResult(content.getUrl(), getConf());
+    }
+
+    // No title is extracted from the command output.
+    String title = "";
+
+    // collect outlink
+    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, content.getMetadata());
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+        parseData));
+  }
+
+  /**
+   * Stores the configuration and rebuilds the content type to command
+   * mapping from the extensions contributed by the <code>parse-ext</code>
+   * plugin. Extensions without a content type or a command are ignored; a
+   * missing or empty encoding or timeout is replaced by the platform default
+   * charset and {@link #TIMEOUT_DEFAULT} respectively.
+   *
+   * @param conf
+   *          the configuration used to look up the plugin repository
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
+
+    for (Extension extension : extensions) {
+
+      // only look for extensions defined by plugin parse-ext
+      if (!extension.getDescriptor().getPluginId().equals("parse-ext")) {
+        continue;
+      }
+
+      String contentType = extension.getAttribute("contentType");
+      if (isNullOrEmpty(contentType)) {
+        continue;
+      }
+
+      String command = extension.getAttribute("command");
+      if (isNullOrEmpty(command)) {
+        continue;
+      }
+
+      // A null or empty encoding means the platform default; an empty
+      // charset name would otherwise make every parse fail.
+      String encoding = extension.getAttribute("encoding");
+      if (isNullOrEmpty(encoding)) {
+        encoding = Charset.defaultCharset().name();
+      }
+
+      String timeoutString = extension.getAttribute("timeout");
+      if (isNullOrEmpty(timeoutString)) {
+        timeoutString = String.valueOf(TIMEOUT_DEFAULT);
+      }
+
+      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString,
+          encoding });
+    }
+  }
+
+  /**
+   * @return the configuration passed to {@link #setConf(Configuration)}, or
+   *         <code>null</code> if it has not been called yet
+   */
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /** Tells whether an extension attribute is absent or blank-by-emptiness. */
+  private static boolean isNullOrEmpty(String value) {
+    return value == null || value.equals("");
+  }
+
+  /**
+   * Converts a configured timeout to seconds, falling back to
+   * {@link #TIMEOUT_DEFAULT} (with a warning) when the value is missing or
+   * not a valid integer, so that a typo in plugin.xml cannot make
+   * {@link #getParse(Content)} throw.
+   */
+  private static int parseTimeout(String timeoutString, String contentType) {
+    if (timeoutString != null) {
+      try {
+        return Integer.parseInt(timeoutString.trim());
+      } catch (NumberFormatException e) {
+        LOG.warn("Invalid timeout '{}' for contentType {}, using default of {} secs",
+            timeoutString, contentType, TIMEOUT_DEFAULT);
+      }
+    }
+    return TIMEOUT_DEFAULT;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
new file mode 100644
index 0000000..6394489
--- /dev/null
+++ b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parser plugin that delegates the actual parsing to an external command.
+ */
+package org.apache.nutch.parse.ext;
+