You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/10/18 21:37:22 UTC

svn commit: r1709307 - in /nutch/trunk: ./ src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/ src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/

Author: mattmann
Date: Sun Oct 18 19:37:22 2015
New Revision: 1709307

URL: http://svn.apache.org/viewvc?rev=1709307&view=rev
Log:
Fix for NUTCH-2141: Change the InteractiveSelenium plugin handler Interface to return page content. Contributed by Balaji <ba...@gmail.com>. This closes #77 #75.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Oct 18 19:37:22 2015
@@ -2,6 +2,9 @@ Nutch Change Log
    
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2141 Change the InteractiveSelenium plugin handler Interface to return page content
+  (Balaji Gurumurthy via mattmann)
+
 * NUTCH-2129 Add protocol status tracking to crawl datum (Michael Joyce via mattmann)
 
 * NUTCH-2142 Nutch File Dump - FileNotFoundException (Invalid Argument) Error (Karanjeet Singh via mattmann)

Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java Sun Oct 18 19:37:22 2015
@@ -277,8 +277,7 @@ public class HttpResponse implements Res
 
         WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf);
 
-        handler.processDriver(driver);
-        processedPage += HttpWebClient.getHTMLContent(driver, conf);
+        processedPage += handler.processDriver(driver);
 
         HttpWebClient.cleanUpDriver(driver);
     }

Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java Sun Oct 18 19:37:22 2015
@@ -32,10 +32,11 @@ public class DefalultMultiInteractionHan
   private static final Logger LOG = LoggerFactory
       .getLogger(DefalultMultiInteractionHandler.class);
 
-  public void processDriver(WebDriver driver) {
+  public String processDriver(WebDriver driver) {
+    // loop and get multiple pages in this string
+    String accumulatedData = "";
     try {
-      // loop and get multiple pages in this string
-      String accumulatedData = "";
+      
       // append the string to the last page's driver
       JavascriptExecutor jsx = (JavascriptExecutor) driver;
       jsx.executeScript("document.body.innerHTML=document.body.innerHTML "
@@ -43,6 +44,7 @@ public class DefalultMultiInteractionHan
     } catch (Exception e) {
       LOG.info(StringUtils.stringifyException(e));
     }
+    return accumulatedData;
   }
 
   public boolean shouldProcessURL(String URL) {

Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java Sun Oct 18 19:37:22 2015
@@ -38,10 +38,11 @@ public class DefaultClickAllAjaxLinksHan
   private static final Logger LOG = LoggerFactory
       .getLogger(DefaultClickAllAjaxLinksHandler.class);
 
-  public void processDriver(WebDriver driver) {
-
+  public String processDriver(WebDriver driver) {
+    
+    String accumulatedData = "";
     try {
-      String accumulatedData = "";
+      
 
       driver.findElement(By.tagName("body")).getAttribute("innerHTML");
       Configuration conf = NutchConfiguration.create();
@@ -78,6 +79,7 @@ public class DefaultClickAllAjaxLinksHan
     } catch (Exception e) {
       LOG.info(StringUtils.stringifyException(e));
     }
+    return accumulatedData;
   }
 
   public boolean shouldProcessURL(String URL) {

Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java Sun Oct 18 19:37:22 2015
@@ -20,7 +20,9 @@ package org.apache.nutch.protocol.intera
 import org.openqa.selenium.WebDriver;
 
 public class DefaultHandler implements InteractiveSeleniumHandler {
-    public void processDriver(WebDriver driver) {}
+    public String processDriver(WebDriver driver) {
+      return null;
+    }
 
     public boolean shouldProcessURL(String URL) {
         return true;

Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java Sun Oct 18 19:37:22 2015
@@ -20,6 +20,6 @@ package org.apache.nutch.protocol.intera
 import org.openqa.selenium.WebDriver;
 
 public interface InteractiveSeleniumHandler {
-    public void processDriver(WebDriver driver);
+    public String processDriver(WebDriver driver);
     public boolean shouldProcessURL(String URL);
 }