You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/08/26 04:21:31 UTC
svn commit: r1697808 - in /nutch/trunk: ./ conf/
src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/
src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/
src/plugin/protocol-selenium/
Author: lewismc
Date: Wed Aug 26 02:21:31 2015
New Revision: 1697808
URL: http://svn.apache.org/r1697808
Log:
NUTCH-2083 Implement functionality to shadow nutch-selenium-grid-plugin from Mo Omer
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
nutch/trunk/src/plugin/protocol-selenium/README.md
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Aug 26 02:21:31 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2083 Implement functionality to shadow nutch-selenium-grid-plugin from Mo Omer (lewismc)
+
* NUTCH-2049 Upgrade to Hadoop 2.4 (lewismc)
* NUTCH-1486 Upgrade to Solr 4.10.2 (lewismc, markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Aug 26 02:21:31 2015
@@ -1785,7 +1785,10 @@ CAUTION: Set the parser.timeout to -1 or
<description>
A String value representing the flavour of Selenium
WebDriver() to use. Currently the following options
- exist - firefox, chrome, safari and opera.
+ exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'.
+ If 'remote' is used it is essential to also set correct properties for
+ 'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and
+ 'selenium.hub.protocol'.
</description>
</property>
@@ -1813,6 +1816,30 @@ CAUTION: Set the parser.timeout to -1 or
</description>
</property>
+<property>
+ <name>selenium.hub.port</name>
+ <value>4444</value>
+ <description>Selenium Hub Location connection port</description>
+</property>
+
+<property>
+ <name>selenium.hub.path</name>
+ <value>/wd/hub</value>
+ <description>Selenium Hub Location connection path</description>
+</property>
+
+<property>
+ <name>selenium.hub.host</name>
+ <value>localhost</value>
+ <description>Selenium Hub Location connection host</description>
+</property>
+
+<property>
+ <name>selenium.hub.protocol</name>
+ <value>http</value>
+ <description>Selenium Hub Location connection protocol</description>
+</property>
+
<!-- lib-selenium configuration -->
<property>
<name>libselenium.page.load.delay</name>
Modified: nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (original)
+++ nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java Wed Aug 26 02:21:31 2015
@@ -17,7 +17,6 @@
package org.apache.nutch.protocol.selenium;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
@@ -30,15 +29,20 @@ import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.remote.DesiredCapabilities;
+import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.safari.SafariDriver;
import org.openqa.selenium.support.ui.WebDriverWait;
+
import com.opera.core.systems.OperaDriver;
+
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.String;
+import java.net.URL;
public class HttpWebClient {
@@ -77,6 +81,13 @@ public class HttpWebClient {
case "opera":
driver = new OperaDriver();
break;
+ case "remote":
+ String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
+ int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
+ String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
+ String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
+ driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox());
+ break;
default:
LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
driver = new FirefoxDriver();
Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java Wed Aug 26 02:21:31 2015
@@ -137,11 +137,8 @@ public class NaiveBayesParseFilter imple
} catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
-
}
-
try {
-
train();
} catch (Exception e) {
@@ -169,7 +166,7 @@ public class NaiveBayesParseFilter imple
if (!filterParse(text)) { // kick in the second tier
// if parent page found
// irrelevant
- LOG.info("ParseFilter: NaiveBayes: Page found irrelevent:: " + url);
+ LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
LOG.info("Checking outlinks");
Outlink[] out = null;
Modified: nutch/trunk/src/plugin/protocol-selenium/README.md
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/README.md?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/README.md (original)
+++ nutch/trunk/src/plugin/protocol-selenium/README.md Wed Aug 26 02:21:31 2015
@@ -1,22 +1,23 @@
Nutch Selenium
==============
+# Introduction
+
This plugin allows you to fetch Javascript pages using [Selenium](http://www.seleniumhq.org/), while relying on the rest of the awesome Nutch stack!
The underlying code is based on the nutch-htmlunit plugin, which was in turn based on nutch-httpclient.
-# IMPORTANT NOTES:
+There are essentially two ways in which Nutch can be used with Selenium.
- * A version of this plugin which relies on the Selenium Hub/Node system can be found here: [nutch-selenium-grid-plugin](https://github.com/momer/nutch-selenium-grid-plugin)
+ * Locally (on each node) as a self contained process, or
+ * via the RemoteWebDriver which connects to [Selenium-Grid](http://www.seleniumhq.org/docs/07_selenium_grid.jsp). A grid consists of a single hub, and one or more nodes.
-# Installation (tested on Ubuntu 14.0x)
+# Installation
## Part 1: Setting up Selenium
- * Ensure that you have Firefox installed
+ * Ensure that you have Firefox installed. More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox)
```
-# More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox)
-
sudo apt-get install firefox
```
* Install Xvfb and its associates
@@ -51,14 +52,92 @@ sudo export DISPLAY=:11
</description>
</property>
```
+
+Then ensure that the following configuration properties are set correctly:
+
+```
+<!-- protocol-selenium plugin properties -->
+
+<property>
+ <name>selenium.driver</name>
+ <value>firefox</value>
+ <description>
+ A String value representing the flavour of Selenium
+ WebDriver() to use. Currently the following options
+ exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'.
+ If 'remote' is used it is essential to also set correct properties for
+ 'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and
+ 'selenium.hub.protocol'.
+ </description>
+</property>
+
+<property>
+ <name>selenium.take.screenshot</name>
+ <value>false</value>
+ <description>
+ Boolean property determining whether the protocol-selenium
+ WebDriver should capture a screenshot of the URL. If set to
+ true remember to define the 'selenium.screenshot.location'
+ property as this determines the location screenshots should be
+ persisted to on HDFS. If that property is not set, screenshots
+ are simply discarded.
+ </description>
+</property>
+
+<property>
+ <name>selenium.screenshot.location</name>
+ <value></value>
+ <description>
+ The location on disk where a URL screenshot should be saved
+ to if the 'selenium.take.screenshot' property is set to true.
+ By default this is null; in this case screenshots held in memory
+ are simply discarded.
+ </description>
+</property>
+
+<property>
+ <name>selenium.hub.port</name>
+ <value>4444</value>
+ <description>Selenium Hub Location connection port</description>
+</property>
+
+<property>
+ <name>selenium.hub.path</name>
+ <value>/wd/hub</value>
+ <description>Selenium Hub Location connection path</description>
+</property>
+
+<property>
+ <name>selenium.hub.host</name>
+ <value>localhost</value>
+ <description>Selenium Hub Location connection host</description>
+</property>
+
+<property>
+ <name>selenium.hub.protocol</name>
+ <value>http</value>
+ <description>Selenium Hub Location connection protocol</description>
+</property>
+
+<!-- lib-selenium configuration -->
+<property>
+ <name>libselenium.page.load.delay</name>
+ <value>3</value>
+ <description>
+ The delay in seconds to use when loading a page with lib-selenium. This
+ setting is used by protocol-selenium and protocol-interactiveselenium
+ since they depend on lib-selenium for fetching.
+ </description>
+</property>
+```
+ * If you've selected the 'remote' value for the 'selenium.driver' property, ensure that you've configured
+ the additional properties based on your [Selenium-Grid installation](http://www.seleniumhq.org/docs/07_selenium_grid.jsp#installation).
+
* Compile nutch
```
ant runtime
```
* Start your web crawl (Ensure that you followed the above steps and have started your xvfb display as shown above)
-```
-NUTCH_HOME/runtime/local/bin/crawl [-i|--index] [-D \"key=value\"] <Seed Dir> <Crawl Dir> <Num Rounds>
-```