You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/04/18 00:35:47 UTC
[3/5] nutch git commit: fix for NUTCH-2191 contributed by karanjeets
fix for NUTCH-2191 contributed by karanjeets
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/3cda2229
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/3cda2229
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/3cda2229
Branch: refs/heads/master
Commit: 3cda222971c970270dcc7525b97dfffe4b818ced
Parents: 366104d
Author: Karanjeet Singh <co...@gmail.com>
Authored: Mon Mar 28 22:58:40 2016 -0700
Committer: Karanjeet Singh <co...@gmail.com>
Committed: Mon Mar 28 22:58:40 2016 -0700
----------------------------------------------------------------------
default.properties | 1 +
.../protocol/htmlunit/HtmlUnitWebDriver.java | 125 +++++++++----------
.../htmlunit/HtmlUnitWebWindowListener.java | 53 ++++----
.../nutch/protocol/htmlunit/HttpResponse.java | 5 +-
4 files changed, 93 insertions(+), 91 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/default.properties
----------------------------------------------------------------------
diff --git a/default.properties b/default.properties
index d34f778..aec5d51 100644
--- a/default.properties
+++ b/default.properties
@@ -90,6 +90,7 @@ plugins.protocol=\
org.apache.nutch.protocol.http*:\
org.apache.nutch.protocol.httpclient*:\
org.apache.nutch.protocol.selenium*
+ org.apache.nutch.protocol.htmlunit*
#
# URL Filter Plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
index fc231c3..5e2c0ac 100644
--- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
@@ -51,79 +51,79 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
private static int maxRedirects;
public HtmlUnitWebDriver() {
- super(enableJavascript);
+ super(enableJavascript);
}
@Override
protected WebClient modifyWebClient(WebClient client) {
- client.getOptions().setJavaScriptEnabled(enableJavascript);
- client.getOptions().setCssEnabled(enableCss);
- client.getOptions().setRedirectEnabled(enableRedirect);
- if(enableJavascript)
- client.setJavaScriptTimeout(javascriptTimeout);
- client.getOptions().setThrowExceptionOnScriptError(false);
- if(enableRedirect)
- client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
- return client;
+ client.getOptions().setJavaScriptEnabled(enableJavascript);
+ client.getOptions().setCssEnabled(enableCss);
+ client.getOptions().setRedirectEnabled(enableRedirect);
+ if(enableJavascript)
+ client.setJavaScriptTimeout(javascriptTimeout);
+ client.getOptions().setThrowExceptionOnScriptError(false);
+ if(enableRedirect)
+ client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
+ return client;
}
public static WebDriver getDriverForPage(String url, Configuration conf) {
- long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
- enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
- enableCss = conf.getBoolean("htmlunit.enable.css", false);
- javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
- int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
- enableRedirect = redirects <= 0 ? false : true;
- maxRedirects = redirects;
+ long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
+ enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
+ enableCss = conf.getBoolean("htmlunit.enable.css", false);
+ javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
+ int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
+ enableRedirect = redirects <= 0 ? false : true;
+ maxRedirects = redirects;
- WebDriver driver = null;
+ WebDriver driver = null;
- try {
- driver = new HtmlUnitWebDriver();
- driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
- driver.get(url);
- } catch(Exception e) {
- if(e instanceof TimeoutException) {
- LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
- return driver;
- }
- cleanUpDriver(driver);
- throw new RuntimeException(e);
- }
+ try {
+ driver = new HtmlUnitWebDriver();
+ driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
+ driver.get(url);
+ } catch(Exception e) {
+ if(e instanceof TimeoutException) {
+ LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+ return driver;
+ }
+ cleanUpDriver(driver);
+ throw new RuntimeException(e);
+ }
- return driver;
+ return driver;
}
public static String getHTMLContent(WebDriver driver, Configuration conf) {
- try {
- if (conf.getBoolean("htmlunit.take.screenshot", false))
- takeScreenshot(driver, conf);
+ try {
+ if (conf.getBoolean("htmlunit.take.screenshot", false))
+ takeScreenshot(driver, conf);
- String innerHtml = "";
- if(enableJavascript) {
- WebElement body = driver.findElement(By.tagName("body"));
- innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
- }
- else
- innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
- return innerHtml;
- } catch(Exception e) {
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
- cleanUpDriver(driver);
- throw new RuntimeException(e);
- }
+ String innerHtml = "";
+ if(enableJavascript) {
+ WebElement body = driver.findElement(By.tagName("body"));
+ innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
+ }
+ else
+ innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+ return innerHtml;
+ } catch(Exception e) {
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ cleanUpDriver(driver);
+ throw new RuntimeException(e);
+ }
}
public static void cleanUpDriver(WebDriver driver) {
- if (driver != null) {
- try {
- driver.close();
- driver.quit();
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
+ if (driver != null) {
+ try {
+ driver.close();
+ driver.quit();
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
}
+ }
}
/**
@@ -142,23 +142,22 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
try {
if (conf.getBoolean("htmlunit.take.screenshot", false))
- takeScreenshot(driver, conf);
+ takeScreenshot(driver, conf);
-
String innerHtml = "";
if(enableJavascript) {
- WebElement body = driver.findElement(By.tagName("body"));
- innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
+ WebElement body = driver.findElement(By.tagName("body"));
+ innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
}
else
- innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+ innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
return innerHtml;
} catch (Exception e) {
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
- throw new RuntimeException(e);
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ throw new RuntimeException(e);
} finally {
- cleanUpDriver(driver);
+ cleanUpDriver(driver);
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
index 760f4aa..baa8774 100644
--- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
@@ -5,32 +5,33 @@ import com.gargoylesoftware.htmlunit.WebWindowListener;
public class HtmlUnitWebWindowListener implements WebWindowListener {
- private Integer redirectCount = 0;
- private Integer maxRedirects = 0;
-
- public HtmlUnitWebWindowListener() {
-
- }
-
- public HtmlUnitWebWindowListener(int maxRedirects) {
- this.maxRedirects = maxRedirects;
- }
-
- @Override
- public void webWindowOpened(WebWindowEvent event) {
-
- }
+ private Integer redirectCount = 0;
+ private Integer maxRedirects = 0;
+
+ public HtmlUnitWebWindowListener() {
+
+ }
+
+ public HtmlUnitWebWindowListener(int maxRedirects) {
+ this.maxRedirects = maxRedirects;
+ }
+
+ @Override
+ public void webWindowOpened(WebWindowEvent event) {
+
+ }
- @Override
- public void webWindowContentChanged(WebWindowEvent event) {
- redirectCount++;
- if(redirectCount > maxRedirects)
- throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects);
- }
+ @Override
+ public void webWindowContentChanged(WebWindowEvent event) {
+ redirectCount++;
+ if(redirectCount > maxRedirects)
+ throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects);
+ }
- @Override
- public void webWindowClosed(WebWindowEvent event) {
-
- }
-
+ @Override
+ public void webWindowClosed(WebWindowEvent event) {
+
+ }
+
}
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 72b1fa1..a2f3b1e 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -344,7 +344,8 @@ public class HttpResponse implements Response {
@Override
public int getCode() {
- // TODO Auto-generated method stub
- return code;
+ // TODO Auto-generated method stub
+ return code;
}
}
+