You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/08/26 04:21:31 UTC

svn commit: r1697808 - in /nutch/trunk: ./ conf/ src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/ src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/ src/plugin/protocol-selenium/

Author: lewismc
Date: Wed Aug 26 02:21:31 2015
New Revision: 1697808

URL: http://svn.apache.org/r1697808
Log:
NUTCH-2083 Implement functionality to shadow nutch-selenium-grid-plugin from Mo Omer

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
    nutch/trunk/src/plugin/protocol-selenium/README.md

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Aug 26 02:21:31 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2083 Implement functionality to shadow nutch-selenium-grid-plugin from Mo Omer (lewismc)
+
 * NUTCH-2049 Upgrade to Hadoop 2.4 (lewismc)
 
 * NUTCH-1486 Upgrade to Solr 4.10.2 (lewismc, markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Aug 26 02:21:31 2015
@@ -1785,7 +1785,10 @@ CAUTION: Set the parser.timeout to -1 or
   <description>
     A String value representing the flavour of Selenium 
     WebDriver() to use. Currently the following options
-    exist - firefox, chrome, safari and opera.
+    exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'.
+    If 'remote' is used it is essential to also set correct properties for
+    'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and
+    'selenium.hub.protocol'.
   </description>
 </property>
 
@@ -1813,6 +1816,30 @@ CAUTION: Set the parser.timeout to -1 or
   </description>
 </property>
 
+<property>
+  <name>selenium.hub.port</name>
+  <value>4444</value>
+  <description>Selenium Hub Location connection port</description>
+</property>
+
+<property>
+  <name>selenium.hub.path</name>
+  <value>/wd/hub</value>
+  <description>Selenium Hub Location connection path</description>
+</property>
+
+<property>
+  <name>selenium.hub.host</name>
+  <value>localhost</value>
+  <description>Selenium Hub Location connection host</description>
+</property>
+
+<property>
+  <name>selenium.hub.protocol</name>
+  <value>http</value>
+  <description>Selenium Hub Location connection protocol</description>
+</property>
+
 <!-- lib-selenium configuration -->
 <property>
   <name>libselenium.page.load.delay</name>

Modified: nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (original)
+++ nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java Wed Aug 26 02:21:31 2015
@@ -17,7 +17,6 @@
 package org.apache.nutch.protocol.selenium;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IOUtils;
@@ -30,15 +29,20 @@ import org.openqa.selenium.WebDriver;
 import org.openqa.selenium.chrome.ChromeDriver;
 import org.openqa.selenium.firefox.FirefoxDriver;
 import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.remote.DesiredCapabilities;
+import org.openqa.selenium.remote.RemoteWebDriver;
 import org.openqa.selenium.safari.SafariDriver;
 import org.openqa.selenium.support.ui.WebDriverWait;
+
 import com.opera.core.systems.OperaDriver;
+
 import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.lang.String;
+import java.net.URL;
 
 public class HttpWebClient {
 
@@ -77,6 +81,13 @@ public class HttpWebClient {
           case "opera":
             driver = new OperaDriver();
             break;
+          case "remote":
+            String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
+            int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
+            String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
+            String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
+            driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox());
+            break;
           default:
             LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
             driver = new FirefoxDriver();

Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java Wed Aug 26 02:21:31 2015
@@ -137,11 +137,8 @@ public class NaiveBayesParseFilter imple
 
     } catch (IOException e) {
       LOG.error(StringUtils.stringifyException(e));
-
     }
-
     try {
-
       train();
     } catch (Exception e) {
 
@@ -169,7 +166,7 @@ public class NaiveBayesParseFilter imple
     if (!filterParse(text)) { // kick in the second tier
       // if parent page found
       // irrelevent
-      LOG.info("ParseFilter: NaiveBayes: Page found irrelevent:: " + url);
+      LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
       LOG.info("Checking outlinks");
 
       Outlink[] out = null;

Modified: nutch/trunk/src/plugin/protocol-selenium/README.md
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/README.md?rev=1697808&r1=1697807&r2=1697808&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/README.md (original)
+++ nutch/trunk/src/plugin/protocol-selenium/README.md Wed Aug 26 02:21:31 2015
@@ -1,22 +1,23 @@
 Nutch Selenium
 ==============
 
+# Introduction
+
 This plugin allows you to fetch Javascript pages using [Selenium](http://www.seleniumhq.org/), while relying on the rest of the awesome Nutch stack!
 
 The underlying code is based on the nutch-htmlunit plugin, which was in turn based on nutch-httpclient.
 
-# IMPORTANT NOTES:
+There are essentially two ways in which Nutch can be used with Selenium.
 
- * A version of this plugin which relies on the Selenium Hub/Node system can be found here: [nutch-selenium-grid-plugin](https://github.com/momer/nutch-selenium-grid-plugin)
+ * Locally (on each node) as a self contained process, or
+ * via the RemoteWebDriver which connects to [Selenium-Grid](http://www.seleniumhq.org/docs/07_selenium_grid.jsp). A grid consists of a single hub, and one or more nodes.
 
-# Installation (tested on Ubuntu 14.0x)
+# Installation
 
 ## Part 1: Setting up Selenium
 
- * Ensure that you have Firefox installed
+ * Ensure that you have Firefox installed. More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox)
 ```
-# More info about the package @ [launchpad](https://launchpad.net/ubuntu/trusty/+source/firefox)
-
 sudo apt-get install firefox
 ```
  * Install Xvfb and its associates
@@ -51,14 +52,92 @@ sudo export DISPLAY=:11
     </description>
   </property>
 ```
+
+Then ensure that the following configuration options are set correctly:
+
+```
+<!-- protocol-selenium plugin properties -->
+
+<property>
+  <name>selenium.driver</name>
+  <value>firefox</value>
+  <description>
+    A String value representing the flavour of Selenium 
+    WebDriver() to use. Currently the following options
+    exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'.
+    If 'remote' is used it is essential to also set correct properties for
+    'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host' and
+    'selenium.hub.protocol'.
+  </description>
+</property>
+
+<property>
+  <name>selenium.take.screenshot</name>
+  <value>false</value>
+  <description>
+    Boolean property determining whether the protocol-selenium
+    WebDriver should capture a screenshot of the URL. If set to
+    true remember to define the 'selenium.screenshot.location' 
+    property as this determines the location screenshots should be 
+    persisted to on HDFS. If that property is not set, screenshots
+    are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>selenium.screenshot.location</name>
+  <value></value>
+  <description>
+    The location on disk where a URL screenshot should be saved
+    to if the 'selenium.take.screenshot' property is set to true.
+    By default this is null, in this case screenshots held in memory
+    are simply discarded.
+  </description>
+</property>
+
+<property>
+  <name>selenium.hub.port</name>
+  <value>4444</value>
+  <description>Selenium Hub Location connection port</description>
+</property>
+
+<property>
+  <name>selenium.hub.path</name>
+  <value>/wd/hub</value>
+  <description>Selenium Hub Location connection path</description>
+</property>
+
+<property>
+  <name>selenium.hub.host</name>
+  <value>localhost</value>
+  <description>Selenium Hub Location connection host</description>
+</property>
+
+<property>
+  <name>selenium.hub.protocol</name>
+  <value>http</value>
+  <description>Selenium Hub Location connection protocol</description>
+</property>
+
+<!-- lib-selenium configuration -->
+<property>
+  <name>libselenium.page.load.delay</name>
+  <value>3</value>
+  <description>
+    The delay in seconds to use when loading a page with lib-selenium. This
+    setting is used by protocol-selenium and protocol-interactiveselenium
+    since they depend on lib-selenium for fetching.
+  </description>
+</property>
+```
+ * If you've selected the 'remote' value for the 'selenium.driver' property, ensure that you've configured
+ the additional properties based on your [Selenium-Grid installation](http://www.seleniumhq.org/docs/07_selenium_grid.jsp#installation).
+
  * Compile nutch
 ```
 ant runtime
 ```
 
  * Start your web crawl (Ensure that you followed the above steps and have started your xvfb display as shown above)
-```
-NUTCH_HOME/runtime/local/bin/crawl [-i|--index] [-D \"key=value\"] <Seed Dir> <Crawl Dir> <Num Rounds>
-```