You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/10/18 21:37:22 UTC
svn commit: r1709307 - in /nutch/trunk: ./
src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/
src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/
Author: mattmann
Date: Sun Oct 18 19:37:22 2015
New Revision: 1709307
URL: http://svn.apache.org/viewvc?rev=1709307&view=rev
Log:
Fix for NUTCH-2141: Change the InteractiveSelenium plugin handler interface to return page content. Contributed by Balaji <ba...@gmail.com>. This closes #77 #75.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Oct 18 19:37:22 2015
@@ -2,6 +2,9 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2141 Change the InteractiveSelenium plugin handler Interface to return page content
+ (Balaji Gurumurthy via mattmann)
+
* NUTCH-2129 Add protocol status tracking to crawl datum (Michael Joyce via mattmann)
* NUTCH-2142 Nutch File Dump - FileNotFoundException (Invalid Argument) Error (Karanjeet Singh via mattmann)
Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java Sun Oct 18 19:37:22 2015
@@ -277,8 +277,7 @@ public class HttpResponse implements Res
WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf);
- handler.processDriver(driver);
- processedPage += HttpWebClient.getHTMLContent(driver, conf);
+ processedPage += handler.processDriver(driver);
HttpWebClient.cleanUpDriver(driver);
}
Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java Sun Oct 18 19:37:22 2015
@@ -32,10 +32,11 @@ public class DefalultMultiInteractionHan
private static final Logger LOG = LoggerFactory
.getLogger(DefalultMultiInteractionHandler.class);
- public void processDriver(WebDriver driver) {
+ public String processDriver(WebDriver driver) {
+ // loop and get multiple pages in this string
+ String accumulatedData = "";
try {
- // loop and get multiple pages in this string
- String accumulatedData = "";
+
// append the string to the last page's driver
JavascriptExecutor jsx = (JavascriptExecutor) driver;
jsx.executeScript("document.body.innerHTML=document.body.innerHTML "
@@ -43,6 +44,7 @@ public class DefalultMultiInteractionHan
} catch (Exception e) {
LOG.info(StringUtils.stringifyException(e));
}
+ return accumulatedData;
}
public boolean shouldProcessURL(String URL) {
Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java Sun Oct 18 19:37:22 2015
@@ -38,10 +38,11 @@ public class DefaultClickAllAjaxLinksHan
private static final Logger LOG = LoggerFactory
.getLogger(DefaultClickAllAjaxLinksHandler.class);
- public void processDriver(WebDriver driver) {
-
+ public String processDriver(WebDriver driver) {
+
+ String accumulatedData = "";
try {
- String accumulatedData = "";
+
driver.findElement(By.tagName("body")).getAttribute("innerHTML");
Configuration conf = NutchConfiguration.create();
@@ -78,6 +79,7 @@ public class DefaultClickAllAjaxLinksHan
} catch (Exception e) {
LOG.info(StringUtils.stringifyException(e));
}
+ return accumulatedData;
}
public boolean shouldProcessURL(String URL) {
Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java Sun Oct 18 19:37:22 2015
@@ -20,7 +20,9 @@ package org.apache.nutch.protocol.intera
import org.openqa.selenium.WebDriver;
public class DefaultHandler implements InteractiveSeleniumHandler {
- public void processDriver(WebDriver driver) {}
+ public String processDriver(WebDriver driver) {
+ return null;
+ }
public boolean shouldProcessURL(String URL) {
return true;
Modified: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java?rev=1709307&r1=1709306&r2=1709307&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java (original)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java Sun Oct 18 19:37:22 2015
@@ -20,6 +20,6 @@ package org.apache.nutch.protocol.intera
import org.openqa.selenium.WebDriver;
public interface InteractiveSeleniumHandler {
- public void processDriver(WebDriver driver);
+ public String processDriver(WebDriver driver);
public boolean shouldProcessURL(String URL);
}