You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:56 UTC
[12/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
deleted file mode 100644
index 86692ae..0000000
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.selenium;
-
-import java.io.BufferedInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.net.URL;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IOUtils;
-import org.openqa.selenium.By;
-import org.openqa.selenium.OutputType;
-import org.openqa.selenium.TakesScreenshot;
-import org.openqa.selenium.TimeoutException;
-import org.openqa.selenium.WebDriver;
-import org.openqa.selenium.chrome.ChromeDriver;
-import org.openqa.selenium.firefox.FirefoxBinary;
-import org.openqa.selenium.firefox.FirefoxDriver;
-import org.openqa.selenium.firefox.FirefoxProfile;
-import org.openqa.selenium.io.TemporaryFilesystem;
-import org.openqa.selenium.remote.DesiredCapabilities;
-import org.openqa.selenium.remote.RemoteWebDriver;
-import org.openqa.selenium.safari.SafariDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriverService;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.opera.core.systems.OperaDriver;
-
-public class HttpWebClient {
-
- private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class);
-
- /**
-  * Per-thread {@link WebDriver}. On first access in a thread a
-  * {@link FirefoxDriver} is created from a profile that sets the stylesheet
-  * and image permission preferences to 2, disables the Flash plugin
-  * preference and sets the allowed-hosts preference to localhost.
-  */
- public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
-
-   @Override
-   protected WebDriver initialValue() {
-     FirefoxProfile profile = new FirefoxProfile();
-     // NOTE(review): 2 is presumably Firefox's "block" permission code - confirm.
-     profile.setPreference("permissions.default.stylesheet", 2);
-     profile.setPreference("permissions.default.image", 2);
-     // Pass a real boolean, exactly as getDriverForPage does for the same key;
-     // the String "false" would be stored as a string-typed preference.
-     profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", false);
-     profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost");
-     return new FirefoxDriver(profile);
-   }
- };
-
- public static WebDriver getDriverForPage(String url, Configuration conf) {
- WebDriver driver = null;
- DesiredCapabilities capabilities = null;
- long pageLoadWait = conf.getLong("page.load.delay", 3);
-
- try {
- String driverType = conf.get("selenium.driver", "firefox");
- switch (driverType) {
- case "firefox":
- String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost");
- long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45);
- boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false);
- int loadImage = conf.getInt("selenium.firefox.load.image", 1);
- int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1);
- FirefoxProfile profile = new FirefoxProfile();
- FirefoxBinary binary = new FirefoxBinary();
- profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
- profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
- profile.setPreference("permissions.default.stylesheet", loadStylesheet);
- profile.setPreference("permissions.default.image", loadImage);
- binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
- driver = new FirefoxDriver(binary, profile);
- break;
- case "chrome":
- driver = new ChromeDriver();
- break;
- case "safari":
- driver = new SafariDriver();
- break;
- case "opera":
- driver = new OperaDriver();
- break;
- case "phantomjs":
- driver = new PhantomJSDriver();
- break;
- case "remote":
- String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
- int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
- String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
- String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
- String seleniumGridDriver = conf.get("selenium.grid.driver","firefox");
- String seleniumGridBinary = conf.get("selenium.grid.binary");
-
- switch (seleniumGridDriver){
- case "firefox":
- capabilities = DesiredCapabilities.firefox();
- capabilities.setBrowserName("firefox");
- capabilities.setJavascriptEnabled(true);
- capabilities.setCapability("firefox_binary",seleniumGridBinary);
- System.setProperty("webdriver.reap_profile", "false");
- driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
- break;
- case "phantomjs":
- capabilities = DesiredCapabilities.phantomjs();
- capabilities.setBrowserName("phantomjs");
- capabilities.setJavascriptEnabled(true);
- capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,seleniumGridBinary);
- driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
- break;
- default:
- LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
- driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox());
- break;
- }
- break;
- default:
- LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
- driver = new FirefoxDriver();
- break;
- }
- LOG.debug("Selenium {} WebDriver selected.", driverType);
-
- driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
- driver.get(url);
- } catch (Exception e) {
- if(e instanceof TimeoutException) {
- LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
- return driver;
- }
- cleanUpDriver(driver);
- throw new RuntimeException(e);
- }
-
- return driver;
- }
-
- public static String getHTMLContent(WebDriver driver, Configuration conf) {
- if (conf.getBoolean("take.screenshot", false)) {
- takeScreenshot(driver, conf);
- }
-
- return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
- }
-
- /**
-  * Closes and quits the given driver and deletes Selenium's temporary files.
-  * Each step runs even if the previous one failed, so a failing
-  * <code>close()</code> does not skip <code>quit()</code> or the temp-file
-  * cleanup.
-  *
-  * @param driver the driver to dispose of; <code>null</code> is ignored
-  * @throws RuntimeException wrapping any exception raised during cleanup
-  */
- public static void cleanUpDriver(WebDriver driver) {
-   if (driver == null) {
-     return;
-   }
-   try {
-     try {
-       driver.close();
-     } finally {
-       try {
-         driver.quit();
-       } finally {
-         TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-       }
-     }
-   } catch (Exception e) {
-     throw new RuntimeException(e);
-   }
- }
-
- /**
-  * Fetches and renders a URL with the selected
-  * {@link org.openqa.selenium.WebDriver} and returns the HTML BODY.
-  * Configuration properties within <code>nutch-site.xml</code> determine
-  * whether a screenshot of the rendered page is taken and persisted
-  * as a timestamped .png into HDFS.
-  * The driver is always cleaned up before this method returns.
-  * @param url the URL to fetch and render
-  * @param conf the {@link org.apache.hadoop.conf.Configuration}
-  * @return the rendered inner HTML page
-  */
- public static String getHtmlPage(String url, Configuration conf) {
-   final WebDriver pageDriver = getDriverForPage(url, conf);
-
-   try {
-     // Optional screenshot plus body innerHTML extraction.
-     return getHTMLContent(pageDriver, conf);
-   } catch (Exception e) {
-     // Broad catch borrowed from lib-htmlunit (acknowledged code smell):
-     // drop Selenium's temporary files, then rethrow unchecked.
-     TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-     throw new RuntimeException(e);
-   } finally {
-     cleanUpDriver(pageDriver);
-   }
- }
-
- /**
-  * Convenience overload of {@link #getHtmlPage(String, Configuration)} that
-  * uses a freshly created default {@link Configuration}. Passing
-  * <code>null</code> here would fail immediately, because the configuration
-  * is dereferenced as soon as the driver is set up.
-  * NOTE(review): a plain Hadoop Configuration presumably does not load
-  * nutch-site.xml, so the selenium properties fall back to the defaults
-  * coded in this class - confirm this is acceptable for callers.
-  *
-  * @param url the URL to fetch and render
-  * @return the rendered inner HTML page
-  */
- public static String getHtmlPage(String url) {
-   return getHtmlPage(url, new Configuration());
- }
-
- /**
-  * Takes a screenshot of the page currently held by the driver and copies it
-  * to <code>screenshot.location</code> on the configured file system, using
-  * the name of the temporary screenshot file. Nothing is written when
-  * <code>screenshot.location</code> is unset or the target file already
-  * exists.
-  *
-  * @param driver the driver holding the rendered page; it is cleaned up if
-  *        taking or storing the screenshot fails
-  * @param conf the {@link Configuration} providing <code>screenshot.location</code>
-  * @throws RuntimeException wrapping any failure
-  */
- private static void takeScreenshot(WebDriver driver, Configuration conf) {
-   try {
-     String url = driver.getCurrentUrl();
-     File srcFile = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
-     LOG.debug("In-memory screenshot taken of: {}", url);
-
-     String location = conf.get("screenshot.location");
-     if (location == null) {
-       LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
-           + "'screenshot.location' is absent from nutch-site.xml.", url);
-       return;
-     }
-
-     FileSystem fs = FileSystem.get(conf);
-     Path screenshotPath = new Path(location + "/" + srcFile.getName());
-     if (fs.exists(screenshotPath)) {
-       // Previously this path fell through with a null output stream and
-       // failed inside copyBytes; an existing file is now left untouched.
-       LOG.debug("Screenshot already exists at {}... not overwriting.", screenshotPath);
-       return;
-     }
-
-     LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
-     InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
-     OutputStream os;
-     try {
-       os = fs.create(screenshotPath);
-     } catch (Exception e) {
-       // copyBytes never ran, so the input stream must be closed here.
-       IOUtils.closeStream(is);
-       throw e;
-     }
-     // copyBytes(in, out, conf) closes both streams when it is done.
-     IOUtils.copyBytes(is, os, conf);
-     LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName());
-   } catch (Exception e) {
-     cleanUpDriver(driver);
-     throw new RuntimeException(e);
-   }
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-xml/build.xml b/src/plugin/lib-xml/build.xml
deleted file mode 100644
index 0f87c07..0000000
--- a/src/plugin/lib-xml/build.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="lib-xml" default="jar">
-
- <import file="../build-plugin.xml" />
-
- <!--
- ! Override the compile target, since there is nothing to compile here.
- ! (The matching jar override further down is currently commented out.)
- ! -->
- <target name="compile" depends="init, resolve-default" />
-
- <!--
- <target name="jar" depends="compile">
- <copy todir="${build.dir}" verbose="true">
- <fileset dir="./lib" includes="**/*.jar" />
- </copy>
- </target>
- -->
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-xml/ivy.xml b/src/plugin/lib-xml/ivy.xml
deleted file mode 100644
index 414f38a..0000000
--- a/src/plugin/lib-xml/ivy.xml
+++ /dev/null
@@ -1,44 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default"/>
- <dependency org="jaxen" name="jaxen" rev="1.1.1" conf="*->master"/>
- <dependency org="xerces" name="xercesImpl" rev="2.11.0" conf="*->master"/>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-xml/plugin.xml b/src/plugin/lib-xml/plugin.xml
deleted file mode 100644
index 79bd17f..0000000
--- a/src/plugin/lib-xml/plugin.xml
+++ /dev/null
@@ -1,65 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!--
- ! XML library - Gathers many XML related libraries:
- !
- ! * Jaxen
- ! - Download : http://jaxen.org/releases.html
- ! - License : http://jaxen.org/license.html
- !
- ! * Xerces-J 2.11.0
- ! - Download : http://xerces.apache.org/xerces2-j/download.cgi
- ! - License : http://www.apache.org/licenses/LICENSE-2.0
- !
- ! * SAXPath 1.0 FCS
- ! - Note : SAXPath has been incorporated into Jaxen.
- ! It has been merged into the Jaxen codebase
- ! and is no longer being maintained separately
- ! - Download : http://sourceforge.net/project/showfiles.php?group_id=26014
- ! - License : OSI-Approved Open Source
- !
- ! * jdom 1.1
- ! - Download : http://www.jdom.org/downloads/index.html
- ! - License : http://www.jdom.org/docs/faq.html#a0030
- !
- !-->
-<plugin
- id="lib-xml"
- name="XML Libraries"
- version="1.0"
- provider-name="org.apache.nutch.xml">
-
- <runtime>
- <library name="jaxen-core.jar">
- <export name="*"/>
- </library>
- <library name="jaxen-jdom.jar">
- <export name="*"/>
- </library>
- <library name="xercesImpl.jar">
- <export name="*"/>
- </library>
- <library name="saxpath.jar">
- <export name="*"/>
- </library>
- <library name="jdom.jar">
- <export name="*"/>
- </library>
- </runtime>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/build.xml b/src/plugin/microformats-reltag/build.xml
deleted file mode 100644
index 395afee..0000000
--- a/src/plugin/microformats-reltag/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="microformats-reltag" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/ivy.xml b/src/plugin/microformats-reltag/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/microformats-reltag/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/plugin.xml b/src/plugin/microformats-reltag/plugin.xml
deleted file mode 100644
index b35e1f4..0000000
--- a/src/plugin/microformats-reltag/plugin.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="microformats-reltag"
- name="Rel-Tag microformat Parser/Indexer/Querier"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="microformats-reltag.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.microformats.reltag.RelTagParser"
- name="Rel-Tag parser"
- point="org.apache.nutch.parse.HtmlParseFilter">
- <implementation id="RelTagParser"
- class="org.apache.nutch.microformats.reltag.RelTagParser"/>
- </extension>
-
- <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"
- name="Rel-Tag indexing filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="RelTagIndexingFilter"
- class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
- </extension>
-
-</plugin>
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
deleted file mode 100644
index e50a150..0000000
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.microformats.reltag;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.parse.Parse;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
- * <code>tag</code> field(s) to the document, one per value found in the
- * parse metadata under {@link RelTagParser#REL_TAG}.
- *
- * @see <a href="http://www.microformats.org/wiki/rel-tag">
- *      http://www.microformats.org/wiki/rel-tag</a>
- * @author Jérôme Charron
- */
-public class RelTagIndexingFilter implements IndexingFilter {
-
-  /** Configuration handed in through {@link #setConf(Configuration)}. */
-  private Configuration conf;
-
-  // Inherited JavaDoc
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    // Rel-Tag values, if any, were possibly put into the parse metadata by
-    // RelTagParser.
-    String[] relTags = parse.getData().getParseMeta()
-        .getValues(RelTagParser.REL_TAG);
-    if (relTags == null) {
-      return doc;
-    }
-
-    for (String relTag : relTags) {
-      doc.add("tag", relTag);
-    }
-    return doc;
-  }
-
-  // ---- Configurable implementation ----
-
-  /** Stores the configuration for later retrieval. */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  /** Returns the configuration previously passed to {@link #setConf(Configuration)}. */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
deleted file mode 100644
index 9176a1e..0000000
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.microformats.reltag;
-
-// JDK imports
-import java.net.URL;
-import java.net.URLDecoder;
-import java.util.Iterator;
-import java.util.Set;
-import java.util.TreeSet;
-import org.w3c.dom.DocumentFragment;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-// Commons Logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Nutch imports
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.StringUtil;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Adds microformat rel-tags of document if found. Every
- * <code>&lt;a&gt;</code> element that carries an <code>href</code> and whose
- * <code>rel</code> attribute contains the token <code>tag</code> contributes
- * the URL-decoded last path segment of its <code>href</code> as a tag.
- *
- * @see <a href="http://www.microformats.org/wiki/rel-tag">
- *      http://www.microformats.org/wiki/rel-tag</a>
- */
-public class RelTagParser implements HtmlParseFilter {
-
-  public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class);
-
-  /** Parse-metadata key under which the discovered tags are stored. */
-  public final static String REL_TAG = "Rel-Tag";
-
-  private Configuration conf = null;
-
-  /**
-   * Scans the HTML document for rel-tags and adds each distinct tag to the
-   * parse metadata of the parse registered under the content's URL.
-   *
-   * @param content the fetched content; only its URL is used here
-   * @param parseResult the parse result whose metadata is extended
-   * @param metaTags unused
-   * @param doc the DOM fragment to scan
-   * @return the same <code>parseResult</code> instance
-   */
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-
-    // get parse obj
-    Parse parse = parseResult.get(content.getUrl());
-    // Trying to find the document's rel-tags
-    Set<String> tags = new Parser(doc).getRelTags();
-    Metadata metadata = parse.getData().getParseMeta();
-    for (String tag : tags) {
-      metadata.add(REL_TAG, tag);
-    }
-
-    return parseResult;
-  }
-
-  /** Walks a DOM tree and collects the distinct rel-tag values in sorted order. */
-  private static class Parser {
-
-    private final Set<String> tags = new TreeSet<String>();
-
-    Parser(Node node) {
-      parse(node);
-    }
-
-    Set<String> getRelTags() {
-      return tags;
-    }
-
-    /** Inspects <code>node</code> and then recurses into all of its children. */
-    void parse(Node node) {
-
-      // Look for <a> elements that carry both a href and a rel attribute
-      if (node.getNodeType() == Node.ELEMENT_NODE
-          && "a".equalsIgnoreCase(node.getNodeName())) {
-        NamedNodeMap attrs = node.getAttributes();
-        Node hrefNode = attrs.getNamedItem("href");
-        Node relNode = attrs.getNamedItem("rel");
-        // Finally check that rel contains the "tag" link type
-        if (hrefNode != null && relNode != null
-            && hasTagRel(relNode.getNodeValue())) {
-          String tag = parseTag(hrefNode.getNodeValue());
-          // Set.add returns false for a tag that was already collected
-          if (!StringUtil.isEmpty(tag) && tags.add(tag)) {
-            LOG.debug("Adding tag: {} to tag set.", tag);
-          }
-        }
-      }
-
-      // Recurse
-      NodeList children = node.getChildNodes();
-      for (int i = 0; children != null && i < children.getLength(); i++) {
-        parse(children.item(i));
-      }
-    }
-
-    /**
-     * Tells whether a <code>rel</code> attribute value contains the
-     * <code>tag</code> link type. The value is a whitespace-separated list of
-     * tokens, so <code>rel="tag nofollow"</code> matches as well.
-     */
-    private static boolean hasTagRel(String rel) {
-      if (rel == null) {
-        return false;
-      }
-      for (String token : rel.trim().split("\\s+")) {
-        if ("tag".equalsIgnoreCase(token)) {
-          return true;
-        }
-      }
-      return false;
-    }
-
-    /**
-     * Extracts the tag from a href: the URL-decoded (UTF-8) text after the
-     * last '/' of the URL path. Returns <code>null</code> when the href
-     * cannot be parsed as an absolute URL or cannot be decoded.
-     */
-    private final static String parseTag(String url) {
-      String tag = null;
-      try {
-        URL u = new URL(url);
-        String path = u.getPath();
-        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
-            "UTF-8");
-      } catch (Exception e) {
-        // Malformed tag... deliberately ignored, the link is skipped
-        tag = null;
-      }
-      return tag;
-    }
-
-  }
-
-  /** Stores the configuration for later retrieval. */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  /** Returns the configuration previously passed to {@link #setConf(Configuration)}. */
-  public Configuration getConf() {
-    return this.conf;
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
deleted file mode 100644
index bef5409..0000000
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
+++ /dev/null
@@ -1,8 +0,0 @@
-<html>
-<body>
-<p>
-A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a>
-Parser/Indexer/Querier plugin.
-</p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/build.xml b/src/plugin/mimetype-filter/build.xml
deleted file mode 100644
index 977e643..0000000
--- a/src/plugin/mimetype-filter/build.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="mimetype-filter" default="jar-core">
-
- <import file="../build-plugin.xml" />
-
- <!-- for junit test -->
- <mkdir dir="${build.test}/data"/>
- <copy todir="${build.test}/data">
- <fileset dir="sample" includes="**/*.txt"/>
- </copy>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/ivy.xml b/src/plugin/mimetype-filter/ivy.xml
deleted file mode 100644
index 0a363f7..0000000
--- a/src/plugin/mimetype-filter/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/plugin.xml b/src/plugin/mimetype-filter/plugin.xml
deleted file mode 100644
index d038447..0000000
--- a/src/plugin/mimetype-filter/plugin.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="mimetype-filter"
- name="Filter indexed documents by the detected MIME"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="mimetype-filter.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <extension id="org.apache.nutch.indexer.filter"
- name="Nutch MIME filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="MimeTypeIndexingFilter"
- class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/>
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/sample/allow-images.txt
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/sample/allow-images.txt b/src/plugin/mimetype-filter/sample/allow-images.txt
deleted file mode 100644
index 0f5f136..0000000
--- a/src/plugin/mimetype-filter/sample/allow-images.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This filter can be configured to work in one of two modes (similar to
-# suffix-url-filter)
-
-# default to reject ('-'): in this mode, only documents with a mimetype that
-# match the ones specified in the config file will be accepted, all other
-# mimetypes will be rejected.
-
-# default to accept ('+'): in this mode, only documents with a mimetype
-# that match the ones specified in the config file will be rejected,
-# all other mimetypes will be accepted.
-
-# The format of this config file is one mimetype (or mimetype prefix, e.g.
-# "image") per line, with no preceding whitespace. The order in which the
-# mimetypes are specified doesn't matter. Blank lines and comments (#) are
-# allowed.
-#
-
--
-
-image
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/sample/block-html.txt
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/sample/block-html.txt b/src/plugin/mimetype-filter/sample/block-html.txt
deleted file mode 100644
index 69600ec..0000000
--- a/src/plugin/mimetype-filter/sample/block-html.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This filter can be configured to work in one of two modes (similar to
-# suffix-url-filter)
-
-# default to reject ('-'): in this mode, only documents with a mimetype that
-# match the ones specified in the config file will be accepted, all other
-# mimetypes will be rejected.
-
-# default to accept ('+'): in this mode, only documents with a mimetype
-# that match the ones specified in the config file will be rejected,
-# all other mimetypes will be accepted.
-
-# The format of this config file is one mimetype (or mimetype prefix, e.g.
-# "image") per line, with no preceding whitespace. The order in which the
-# mimetypes are specified doesn't matter. Blank lines and comments (#) are
-# allowed.
-#
-
-+
-
-text/html
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
deleted file mode 100644
index 494d888..0000000
--- a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
+++ /dev/null
@@ -1,273 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.filter;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.OptionBuilder;
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.GnuParser;
-import org.apache.commons.cli.UnrecognizedOptionException;
-
-// Nutch imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-
-import org.apache.nutch.net.protocols.Response;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-
-import org.apache.nutch.metadata.Metadata;
-
-import org.apache.nutch.util.MimeUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.PrefixStringMatcher;
-import org.apache.nutch.util.TrieStringMatcher;
-import org.apache.tika.Tika;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
- * of documents based on the MIME Type detected by Tika
- *
- */
-public class MimeTypeIndexingFilter implements IndexingFilter {
-
- public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";
-
- private static final Logger LOG = LoggerFactory
- .getLogger(MimeTypeIndexingFilter.class);
-
- private MimeUtil MIME;
- private Tika tika = new Tika();
-
- private TrieStringMatcher trie;
-
- private Configuration conf;
-
- private boolean acceptMode = true;
-
- // Inherited JavaDoc
- @Override
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
- CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
- String mimeType;
- String contentType;
-
- Writable tcontentType = datum.getMetaData()
- .get(new Text(Response.CONTENT_TYPE));
-
- if (tcontentType != null) {
- contentType = tcontentType.toString();
- } else {
- contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
- }
-
- if (contentType == null) {
- mimeType = tika.detect(url.toString());
- } else {
- mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
- }
-
- contentType = mimeType;
-
- if (LOG.isInfoEnabled()) {
- LOG.info(String.format("[%s] %s", contentType, url));
- }
-
- if (trie != null) {
- if (trie.shortestMatch(contentType) == null) {
- // no match, but
- if (acceptMode) {
- return doc;
- }
- return null;
- } else {
- // matched, but we are blocking
- if (acceptMode) {
- return null;
- }
- }
- }
-
- return doc;
- }
-
- /*
- * -----------------------------
- * <implementation:Configurable> *
- * -----------------------------
- */
- @Override
- public void setConf(Configuration conf) {
- this.conf = conf;
- MIME = new MimeUtil(conf);
-
- // load the file of the values
- String file = conf.get(MIMEFILTER_REGEX_FILE, "");
-
- if (file != null) {
- if (file.isEmpty()) {
- LOG.warn(String
- .format("Missing %s property, ALL mimetypes will be allowed",
- MIMEFILTER_REGEX_FILE));
- } else {
- Reader reader = conf.getConfResourceAsReader(file);
-
- try {
- readConfiguration(reader);
- } catch (IOException e) {
- if (LOG.isErrorEnabled()) {
- LOG.error(e.getMessage());
- }
-
- throw new RuntimeException(e.getMessage(), e);
- }
- }
- }
- }
-
- private void readConfiguration(Reader reader) throws IOException {
- BufferedReader in = new BufferedReader(reader);
- String line;
- List rules = new ArrayList();
-
- while (null != (line = in.readLine())) {
- if (line.length() == 0) {
- continue;
- }
-
- char first = line.charAt(0);
- switch (first) {
- case ' ':
- case '\n':
- case '#': // skip blank & comment lines
- break;
- case '+':
- acceptMode = true;
- break;
- case '-':
- acceptMode = false;
- break;
- default:
- rules.add(line);
- break;
- }
- }
-
- trie = new PrefixStringMatcher(rules);
- }
-
- @Override
- public Configuration getConf() {
- return this.conf;
- }
-
- /**
- * Main method for invoking this tool
- *
- * @throws IOException, IndexingException
- */
- public static void main(String[] args) throws IOException, IndexingException {
- Option helpOpt = new Option("h", "help", false, "show this help message");
- Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
- .withDescription(
- "Rules file to be used in the tests relative to the conf directory")
- .isRequired().create("rules");
-
- Options options = new Options();
- options.addOption(helpOpt).addOption(rulesOpt);
-
- CommandLineParser parser = new GnuParser();
- HelpFormatter formatter = new HelpFormatter();
- String rulesFile;
-
- try {
- CommandLine line = parser.parse(options, args);
-
- if (line.hasOption("help") || !line.hasOption("rules")) {
- formatter
- .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
- options, true);
- return;
- }
-
- rulesFile = line.getOptionValue("rules");
- } catch (UnrecognizedOptionException e) {
- formatter
- .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
- options, true);
- return;
- } catch (Exception e) {
- LOG.error(StringUtils.stringifyException(e));
- e.printStackTrace();
- return;
- }
-
- MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
- Configuration conf = NutchConfiguration.create();
- conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
- filter.setConf(conf);
-
- BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
- String line;
-
- while ((line = in.readLine()) != null && !line.isEmpty()) {
- Metadata metadata = new Metadata();
- metadata.set(Response.CONTENT_TYPE, line);
- ParseImpl parse = new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
-
- NutchDocument doc = filter.filter(new NutchDocument(), parse,
- new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
- if (doc != null) {
- System.out.print("+ ");
- System.out.println(line);
- } else {
- System.out.print("- ");
- System.out.println(line);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
----------------------------------------------------------------------
diff --git a/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java b/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
deleted file mode 100644
index bca230f..0000000
--- a/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.filter;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * JUnit based tests of class
- * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter}
- *
- */
-public class MimeTypeIndexingFilterTest {
-
- private Configuration conf = NutchConfiguration.create();
- private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
- private String[] MIME_TYPES = { "text/html", "image/png", "application/pdf" };
- private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length];
- private String sampleDir = System.getProperty("test.data", ".");
-
- @Before
- public void setUp() throws Exception {
- for (int i = 0; i < MIME_TYPES.length; i++) {
- Metadata metadata = new Metadata();
- metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]);
-
- ParseImpl parse = new ParseImpl("text",
- new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
-
- parses[i] = parse;
- }
- }
-
- @Test
- public void testMissingConfigFile() throws Exception {
- String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
- Assert.assertEquals(String
- .format("Property %s must not be present in the the configuration file",
- MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file);
-
- filter.setConf(conf);
-
- // property not set so in this cases all documents must pass the filter
- for (int i = 0; i < parses.length; i++) {
- NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
- new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
- Assert.assertNotNull("All documents must be allowed by default", doc);
- }
- }
-
- @Test
- public void testAllowOnlyImages() throws Exception {
- conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
- filter.setConf(conf);
-
- for (int i = 0; i < parses.length; i++) {
- NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
- new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
- if (MIME_TYPES[i].contains("image")) {
- Assert.assertNotNull("Allow only images", doc);
- } else {
- Assert.assertNull("Block everything else", doc);
- }
- }
- }
-
- @Test
- public void testBlockHTML() throws Exception {
- conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
- filter.setConf(conf);
-
- for (int i = 0; i < parses.length; i++) {
- NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
- new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
- if (MIME_TYPES[i].contains("html")) {
- Assert.assertNull("Block only HTML documents", doc);
- } else {
- Assert.assertNotNull("Allow everything else", doc);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/nutch-extensionpoints/build.xml b/src/plugin/nutch-extensionpoints/build.xml
deleted file mode 100644
index 45eb815..0000000
--- a/src/plugin/nutch-extensionpoints/build.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="nutch-extensionpoints" default="jar">
-
- <import file="../build-plugin.xml"/>
-
- <!--
- ! Override the compile and jar targets,
- ! since there is nothing to compile here.
- ! -->
- <target name="compile" depends="init, resolve-default"/>
-
- <!--target name="jar" depends="compile"/-->
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/nutch-extensionpoints/ivy.xml b/src/plugin/nutch-extensionpoints/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/nutch-extensionpoints/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/nutch-extensionpoints/plugin.xml b/src/plugin/nutch-extensionpoints/plugin.xml
deleted file mode 100644
index 8cf7a23..0000000
--- a/src/plugin/nutch-extensionpoints/plugin.xml
+++ /dev/null
@@ -1,67 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="nutch-extensionpoints"
- name="the nutch core extension points"
- version="2.0.0"
- provider-name="nutch.org">
-
- <!-- This file hosts all extension points that the Nutch core code offers.
- Please note that plugins can define their own extension points as well, to be extendable. -->
-
-<extension-point
- id="org.apache.nutch.indexer.IndexingFilter"
- name="Nutch Indexing Filter"/>
-
-<extension-point
- id="org.apache.nutch.indexer.IndexWriter"
- name="Nutch Index Writer"/>
-
-<extension-point
- id="org.apache.nutch.parse.Parser"
- name="Nutch Content Parser"/>
-
-<extension-point
- id="org.apache.nutch.parse.HtmlParseFilter"
- name="HTML Parse Filter"/>
-
-<extension-point
- id="org.apache.nutch.protocol.Protocol"
- name="Nutch Protocol"/>
-
-<extension-point
- id="org.apache.nutch.net.URLFilter"
- name="Nutch URL Filter"/>
-
-<extension-point
- id="org.apache.nutch.net.URLExemptionFilter"
- name="Nutch URL Ignore Exemption Filter"/>
-
-<extension-point
- id="org.apache.nutch.net.URLNormalizer"
- name="Nutch URL Normalizer"/>
-
-<extension-point
- id="org.apache.nutch.scoring.ScoringFilter"
- name="Nutch Scoring"/>
-
-<extension-point
- id="org.apache.nutch.segment.SegmentMergeFilter"
- name="Nutch Segment Merge Filter"/>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/build.xml b/src/plugin/parse-ext/build.xml
deleted file mode 100644
index 25552fa..0000000
--- a/src/plugin/parse-ext/build.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-ext" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- <ant target="deploy" inheritall="false" dir="../protocol-file"/>
- </target>
-
-
- <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/>
- <chmod file="${deploy.dir}/command" perm="755"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/command
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/command b/src/plugin/parse-ext/command
deleted file mode 100644
index f42c055..0000000
--- a/src/plugin/parse-ext/command
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-#
-# Sample bash script as external command invoked by parse-ext plugin
-#
-# Usage: command mimeType
-# Copies standard input to standard output for the "cat" example type and
-# prints the md5sum of standard input for the "md5sum" example type; any
-# other mimeType is rejected with exit status 1.
-#
-# 20040701, John Xing
-
-# abort as soon as any command fails
-set -e
-
-if [ $# -ne 1 ]; then
-  # quoted so that a script path containing blanks or glob characters is printed verbatim
-  echo "Usage: $0 mimeType" >&2
-  exit 1
-fi
-
-case "$1" in
-"application/vnd.nutch.example.cat")
-  cat
-  ;;
-"application/vnd.nutch.example.md5sum")
-  md5sum
-  ;;
-*)
-  echo "Can't parse mimeType $1" >&2
-  exit 1
-  ;;
-esac
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/ivy.xml b/src/plugin/parse-ext/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/parse-ext/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/plugin.xml b/src/plugin/parse-ext/plugin.xml
deleted file mode 100644
index 6819b36..0000000
--- a/src/plugin/parse-ext/plugin.xml
+++ /dev/null
@@ -1,60 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="parse-ext"
- name="External Parser Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="parse-ext.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.parse.ext"
- name="ExtParse"
- point="org.apache.nutch.parse.Parser">
-
- <implementation id="ExtParser"
- class="org.apache.nutch.parse.ext.ExtParser">
- <parameter name="contentType" value="application/vnd.nutch.example.cat"/>
- <parameter name="pathSuffix" value=""/>
- <parameter name="command" value="./build/plugins/parse-ext/command"/>
- <parameter name="timeout" value="10"/>
- <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
- <!-- <parameter name="encoding" value="UTF-8"/> -->
- </implementation>
-
- <implementation id="ExtParser"
- class="org.apache.nutch.parse.ext.ExtParser">
- <parameter name="contentType" value="application/vnd.nutch.example.md5sum"/>
- <parameter name="pathSuffix" value=""/>
- <parameter name="command" value="./build/plugins/parse-ext/command"/>
- <parameter name="timeout" value="20"/>
- <!-- can optionally specify an encoding parameter now, see NUTCH-564-->
- <!-- <parameter name="encoding" value="UTF-8"/> -->
- </implementation>
-
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
deleted file mode 100644
index 94d9b32..0000000
--- a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
+++ /dev/null
@@ -1,183 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.ext;
-
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.OutlinkExtractor;
-
-import org.apache.nutch.util.CommandRunner;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.hadoop.conf.Configuration;
-
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.PluginRepository;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.Hashtable;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.nio.charset.Charset;
-
-/**
- * A wrapper that invokes an external command to do the real parsing job.
- *
- * <p>
- * The mapping from content type to command, timeout and output encoding is
- * built in {@link #setConf(Configuration)} from the extensions that the
- * <code>parse-ext</code> plugin registers at the
- * <code>org.apache.nutch.parse.Parser</code> extension point.
- * </p>
- *
- * @author John Xing
- */
-
-public class ExtParser implements Parser {
-
-  public static final Logger LOG = LoggerFactory
-      .getLogger("org.apache.nutch.parse.ext");
-
-  static final int BUFFER_SIZE = 4096;
-
-  static final int TIMEOUT_DEFAULT = 30; // in seconds
-
-  // handy map from String contentType to String[] {command, timeoutString,
-  // encoding}; setConf() only stores timeout strings that parse as an int
-  Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>();
-
-  private Configuration conf;
-
-  public ExtParser() {
-  }
-
-  /**
-   * Parses the content by piping its raw bytes to the external command
-   * configured for its content type and using the command's standard output
-   * as the parse text.
-   *
-   * @param content
-   *          the fetched content to parse
-   * @return a successful parse result holding the command output and the
-   *         outlinks extracted from it, or an empty parse result with a
-   *         failed status when no command is configured for the content
-   *         type, the content is truncated, the command exits with a
-   *         non-zero value, or an exception is thrown while running it
-   */
-  public ParseResult getParse(Content content) {
-
-    String contentType = content.getContentType();
-
-    // Hashtable.get(null) throws a NullPointerException, so treat a missing
-    // content type like an unknown one.
-    String[] params = (contentType == null) ? null : TYPE_PARAMS_MAP
-        .get(contentType);
-    if (params == null) {
-      return new ParseStatus(ParseStatus.FAILED,
-          "No external command defined for contentType: " + contentType)
-          .getEmptyParseResult(content.getUrl(), getConf());
-    }
-
-    String command = params[0];
-    int timeout = Integer.parseInt(params[1]);
-    String encoding = params[2];
-
-    if (LOG.isTraceEnabled()) {
-      LOG.trace("Use " + command + " with timeout=" + timeout + "secs");
-    }
-
-    String text;
-
-    try {
-
-      byte[] raw = content.getContent();
-
-      // refuse to parse content whose length differs from the declared
-      // Content-Length header
-      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
-      if (contentLength != null
-          && raw.length != Integer.parseInt(contentLength)) {
-        return new ParseStatus(ParseStatus.FAILED,
-            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
-                + " bytes. Parser can't handle incomplete " + contentType
-                + " file.").getEmptyParseResult(content.getUrl(), getConf());
-      }
-
-      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
-      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4);
-
-      CommandRunner cr = new CommandRunner();
-
-      // the content type is passed to the command as its argument
-      cr.setCommand(command + " " + contentType);
-      cr.setInputStream(new ByteArrayInputStream(raw));
-      cr.setStdOutputStream(os);
-      cr.setStdErrorStream(es);
-
-      cr.setTimeout(timeout);
-
-      cr.evaluate();
-
-      if (cr.getExitValue() != 0) {
-        return new ParseStatus(ParseStatus.FAILED, "External command "
-            + command + " failed with error: " + es.toString())
-            .getEmptyParseResult(content.getUrl(), getConf());
-      }
-
-      text = os.toString(encoding);
-
-    } catch (Exception e) { // run time exception
-      return new ParseStatus(e)
-          .getEmptyParseResult(content.getUrl(), getConf());
-    }
-
-    // collect outlink
-    Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf());
-
-    // the external command provides no title, so it is always empty
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
-        outlinks, content.getMetadata());
-    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
-        parseData));
-  }
-
-  /**
-   * Stores the configuration and fills the content type map from the
-   * extensions of plugin <code>parse-ext</code>. Extensions without a
-   * contentType or command are skipped. A missing or empty encoding falls
-   * back to the platform default charset; a missing, empty or non-numeric
-   * timeout falls back to {@link #TIMEOUT_DEFAULT}.
-   *
-   * @param conf
-   *          the configuration used to look up the plugin repository
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    Extension[] extensions = PluginRepository.get(conf)
-        .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions();
-
-    for (Extension extension : extensions) {
-
-      // only look for extensions defined by plugin parse-ext
-      if (!"parse-ext".equals(extension.getDescriptor().getPluginId())) {
-        continue;
-      }
-
-      String contentType = extension.getAttribute("contentType");
-      if (isEmpty(contentType)) {
-        continue;
-      }
-
-      String command = extension.getAttribute("command");
-      if (isEmpty(command)) {
-        continue;
-      }
-
-      // null or empty encoding means default; an empty charset name would
-      // otherwise make every parse fail when decoding the command output
-      String encoding = extension.getAttribute("encoding");
-      if (isEmpty(encoding)) {
-        encoding = Charset.defaultCharset().name();
-      }
-
-      // validate once here so getParse() can rely on a parseable value
-      String timeoutString = String.valueOf(parseTimeout(
-          extension.getAttribute("timeout"), contentType));
-
-      TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString,
-          encoding });
-    }
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /** Returns true when the attribute value is null or the empty string. */
-  private static boolean isEmpty(String value) {
-    return value == null || value.equals("");
-  }
-
-  /**
-   * Converts a configured timeout to seconds, falling back to
-   * {@link #TIMEOUT_DEFAULT} (with a warning) when it is not a valid integer.
-   */
-  private static int parseTimeout(String raw, String contentType) {
-    if (isEmpty(raw)) {
-      return TIMEOUT_DEFAULT;
-    }
-    try {
-      return Integer.parseInt(raw.trim());
-    } catch (NumberFormatException e) {
-      LOG.warn("Invalid timeout '" + raw + "' for contentType " + contentType
-          + ", using default of " + TIMEOUT_DEFAULT + " secs");
-      return TIMEOUT_DEFAULT;
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
deleted file mode 100644
index 6394489..0000000
--- a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse wrapper to run external command to do the parsing.
- */
-package org.apache.nutch.parse.ext;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java b/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
deleted file mode 100644
index a399273..0000000
--- a/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.ext;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-
-/**
- * Unit tests for ExtParser. First creates a temp file with fixed content, then
- * fetches and parses it using the external commands 'cat' and 'md5sum'
- * alternately, 10 times each. Doing so also acts as a light stress test for
- * class CommandRunner.java (as used in ExtParser.java).
- *
- * Warning: the test currently only runs on the linux platform.
- *
- * @author John Xing
- */
-public class TestExtParser {
-  private File tempFile = null;
-  private String urlString = null;
-  private Content content = null;
-  private Parse parse = null;
-
-  private String expectedText = "nutch rocks nutch rocks nutch rocks";
-  // echo -n "nutch rocks nutch rocks nutch rocks" | md5sum
-  private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526";
-
-  /**
-   * Writes the expected text to a temp file and fetches it as Nutch content.
-   * Must be public: JUnit 4 rejects non-public {@code @Before} methods.
-   */
-  @Before
-  public void setUp() throws ProtocolException, IOException {
-    // prepare a temp file with expectedText as its content
-    // This system property is defined in ./src/plugin/build-plugin.xml
-    String path = System.getProperty("test.data");
-    if (path != null) {
-      File tempDir = new File(path);
-      if (!tempDir.exists()) {
-        // mkdirs() also creates missing parent directories
-        tempDir.mkdirs();
-      }
-      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt",
-          tempDir);
-    } else {
-      // otherwise in java.io.tmpdir
-      tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
-    }
-    urlString = tempFile.toURI().toURL().toString();
-
-    FileOutputStream fos = new FileOutputStream(tempFile);
-    try {
-      fos.write(expectedText.getBytes());
-    } finally {
-      // release the file handle even if the write fails
-      fos.close();
-    }
-
-    // get nutch content
-    Protocol protocol = new ProtocolFactory(NutchConfiguration.create())
-        .getProtocol(urlString);
-    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
-        .getContent();
-  }
-
-  /**
-   * Drops the fetched content. Must be public: JUnit 4 rejects non-public
-   * {@code @After} methods.
-   */
-  @After
-  public void tearDown() {
-    // clean content
-    content = null;
-
-    // clean temp file
-    // if (tempFile != null && tempFile.exists())
-    // tempFile.delete();
-  }
-
-  /**
-   * Parses the temp file content through the 'cat' and 'md5sum' external
-   * parsers and checks the text each one produces.
-   */
-  @Test
-  public void testIt() throws ParseException {
-    String contentType;
-
-    // now test only on linux platform
-    if (!System.getProperty("os.name").equalsIgnoreCase("linux")) {
-      System.err
-          .println("Current OS is " + System.getProperty("os.name") + ".");
-      System.err.println("No test is run on OS other than linux.");
-      return;
-    }
-
-    Configuration conf = NutchConfiguration.create();
-    // loop alternately, total 10*2 times of invoking external command
-    for (int i = 0; i < 10; i++) {
-      // check external parser that does 'cat'
-      contentType = "application/vnd.nutch.example.cat";
-      content.setContentType(contentType);
-      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
-          content.getUrl());
-      Assert.assertEquals(expectedText, parse.getText());
-
-      // check external parser that does 'md5sum'
-      contentType = "application/vnd.nutch.example.md5sum";
-      content.setContentType(contentType);
-      parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(
-          content.getUrl());
-      Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/build.xml b/src/plugin/parse-html/build.xml
deleted file mode 100755
index a5b99b5..0000000
--- a/src/plugin/parse-html/build.xml
+++ /dev/null
@@ -1,40 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-html" default="jar-core">
-
- <!-- Ant build for the parse-html plugin. The shared build logic is imported
-      from ../build-plugin.xml; this file supplies the plugin-specific
-      dependency hooks (deps-jar, plugin.deps, deps-test) for lib-nekohtml. -->
- <import file="../build-plugin.xml"/>
-
- <!-- Build compilation dependencies -->
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../lib-nekohtml"/>
- </target>
-
- <!-- Add compilation dependencies to classpath -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-nekohtml/*.jar" />
- </fileset>
- </path>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/>
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/ivy.xml b/src/plugin/parse-html/ivy.xml
deleted file mode 100644
index e8a6135..0000000
--- a/src/plugin/parse-html/ivy.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../../ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- <!-- TagSoup; the parse-html plugin.xml lists the matching
-      tagsoup-1.2.1.jar as a runtime library, so keep both versions in sync -->
- <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/plugin.xml b/src/plugin/parse-html/plugin.xml
deleted file mode 100755
index 3be70c3..0000000
--- a/src/plugin/parse-html/plugin.xml
+++ /dev/null
@@ -1,48 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="parse-html"
- name="Html Parse Plug-in"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="parse-html.jar">
- <export name="*"/>
- </library>
- <!-- version must match the tagsoup rev declared in this plugin's ivy.xml -->
- <library name="tagsoup-1.2.1.jar"/>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- <!-- lib-nekohtml is also the compile-time dependency set up in build.xml -->
- <import plugin="lib-nekohtml"/>
- </requires>
-
- <!-- Registers HtmlParser at the org.apache.nutch.parse.Parser extension point -->
- <extension id="org.apache.nutch.parse.html"
- name="HtmlParse"
- point="org.apache.nutch.parse.Parser">
-
- <implementation id="org.apache.nutch.parse.html.HtmlParser"
- class="org.apache.nutch.parse.html.HtmlParser">
- <!-- NOTE(review): presumably "|" separates alternative content types;
-      confirm how the parser factory interprets this value -->
- <parameter name="contentType" value="text/html|application/xhtml+xml"/>
- <parameter name="pathSuffix" value=""/>
- </implementation>
-
- </extension>
-
-</plugin>