You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/04/18 00:35:47 UTC

[3/5] nutch git commit: fix for NUTCH-2191 contributed by karanjeets

fix for NUTCH-2191 contributed by karanjeets


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/3cda2229
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/3cda2229
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/3cda2229

Branch: refs/heads/master
Commit: 3cda222971c970270dcc7525b97dfffe4b818ced
Parents: 366104d
Author: Karanjeet Singh <co...@gmail.com>
Authored: Mon Mar 28 22:58:40 2016 -0700
Committer: Karanjeet Singh <co...@gmail.com>
Committed: Mon Mar 28 22:58:40 2016 -0700

----------------------------------------------------------------------
 default.properties                              |   1 +
 .../protocol/htmlunit/HtmlUnitWebDriver.java    | 125 +++++++++----------
 .../htmlunit/HtmlUnitWebWindowListener.java     |  53 ++++----
 .../nutch/protocol/htmlunit/HttpResponse.java   |   5 +-
 4 files changed, 93 insertions(+), 91 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/default.properties
----------------------------------------------------------------------
diff --git a/default.properties b/default.properties
index d34f778..aec5d51 100644
--- a/default.properties
+++ b/default.properties
@@ -90,6 +90,7 @@ plugins.protocol=\
    org.apache.nutch.protocol.http*:\
    org.apache.nutch.protocol.httpclient*:\
    org.apache.nutch.protocol.selenium*
+   org.apache.nutch.protocol.htmlunit*
 
 #
 # URL Filter Plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
index fc231c3..5e2c0ac 100644
--- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
@@ -51,79 +51,79 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
   private static int maxRedirects;
   
   public HtmlUnitWebDriver() {
-	super(enableJavascript);
+    super(enableJavascript);
   }
   
   @Override
   protected WebClient modifyWebClient(WebClient client) {
-	  client.getOptions().setJavaScriptEnabled(enableJavascript);
-	  client.getOptions().setCssEnabled(enableCss);
-	  client.getOptions().setRedirectEnabled(enableRedirect);
-	  if(enableJavascript)
-		  client.setJavaScriptTimeout(javascriptTimeout);
-	  client.getOptions().setThrowExceptionOnScriptError(false);
-	  if(enableRedirect)
-		  client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
-	  return client;
+    client.getOptions().setJavaScriptEnabled(enableJavascript);
+    client.getOptions().setCssEnabled(enableCss);
+    client.getOptions().setRedirectEnabled(enableRedirect);
+    if(enableJavascript)
+      client.setJavaScriptTimeout(javascriptTimeout);
+      client.getOptions().setThrowExceptionOnScriptError(false);
+      if(enableRedirect)
+        client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
+	return client;
   }
   
   public static WebDriver getDriverForPage(String url, Configuration conf) {
-	  long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
-	  enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
-	  enableCss = conf.getBoolean("htmlunit.enable.css", false);
-	  javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
-	  int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
-	  enableRedirect = redirects <= 0 ? false : true;
-	  maxRedirects = redirects;
+    long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
+    enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
+    enableCss = conf.getBoolean("htmlunit.enable.css", false);
+    javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
+    int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
+    enableRedirect = redirects <= 0 ? false : true;
+    maxRedirects = redirects;
 	  
-	  WebDriver driver = null;
+    WebDriver driver = null;
 	  
-	  try {
-		  driver = new HtmlUnitWebDriver();
-		  driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
-		  driver.get(url);
-	  } catch(Exception e) {
-		  if(e instanceof TimeoutException) {
-				LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
-				return driver;
-			}
-			cleanUpDriver(driver);
-		    throw new RuntimeException(e);
-	  }
+    try {
+      driver = new HtmlUnitWebDriver();
+      driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
+      driver.get(url);
+     } catch(Exception e) {
+       if(e instanceof TimeoutException) {
+	 LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+	 return driver;
+     }
+     cleanUpDriver(driver);
+     throw new RuntimeException(e);
+    }
 
-      return driver;
+    return driver;
   }
 
   public static String getHTMLContent(WebDriver driver, Configuration conf) {
-      try {
-		  if (conf.getBoolean("htmlunit.take.screenshot", false))
-	    	  takeScreenshot(driver, conf);
+    try {
+      if (conf.getBoolean("htmlunit.take.screenshot", false))
+      takeScreenshot(driver, conf);
 		  
-		  String innerHtml = "";
-	      if(enableJavascript) {
-	    	  WebElement body = driver.findElement(By.tagName("body"));
-	    	  innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); 
-	      }
-	      else
-	    	  innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
-	      return innerHtml;
-      } catch(Exception e) {
-    	  TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-    	  cleanUpDriver(driver);
-    	  throw new RuntimeException(e);
-      } 
+      String innerHtml = "";
+      if(enableJavascript) {
+	WebElement body = driver.findElement(By.tagName("body"));
+	innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); 
+      }
+      else
+	innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+      return innerHtml;
+    } catch(Exception e) {
+	TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+    	cleanUpDriver(driver);
+    	throw new RuntimeException(e);
+    } 
   }
 
   public static void cleanUpDriver(WebDriver driver) {
-      if (driver != null) {
-          try {
-        	  driver.close();
-              driver.quit();
-              TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-          } catch (Exception e) {
-              throw new RuntimeException(e);
-          }
+    if (driver != null) {
+      try {
+        driver.close();
+        driver.quit();
+        TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      } catch (Exception e) {
+        throw new RuntimeException(e);
       }
+    }
   }
 
   /**
@@ -142,23 +142,22 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
 
     try {
       if (conf.getBoolean("htmlunit.take.screenshot", false))
-    	  takeScreenshot(driver, conf);
+	takeScreenshot(driver, conf);
 
-      
       String innerHtml = "";
       if(enableJavascript) {
-    	  WebElement body = driver.findElement(By.tagName("body"));
-    	  innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); 
+	WebElement body = driver.findElement(By.tagName("body"));
+    	innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); 
       }
       else
-    	  innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+    	innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
       return innerHtml;
 
     } catch (Exception e) {
-	      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
-	      throw new RuntimeException(e);
+	TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+        throw new RuntimeException(e);
     } finally {
-    	cleanUpDriver(driver);
+        cleanUpDriver(driver);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
index 760f4aa..baa8774 100644
--- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
@@ -5,32 +5,33 @@ import com.gargoylesoftware.htmlunit.WebWindowListener;
 
 public class HtmlUnitWebWindowListener implements WebWindowListener {
 
-	private Integer redirectCount = 0;
-	private Integer maxRedirects = 0;
-	
-	public HtmlUnitWebWindowListener() {
-		
-	}
-	
-	public HtmlUnitWebWindowListener(int maxRedirects) {
-		this.maxRedirects = maxRedirects;
-	}
-	
-	@Override
-	public void webWindowOpened(WebWindowEvent event) {
-		
-	}
+  private Integer redirectCount = 0;
+  private Integer maxRedirects = 0;
+  
+  public HtmlUnitWebWindowListener() {
+    
+  }
+  
+  public HtmlUnitWebWindowListener(int maxRedirects) {
+    this.maxRedirects = maxRedirects;
+  }
+  
+  @Override
+  public void webWindowOpened(WebWindowEvent event) {
+    
+  }
 
-	@Override
-	public void webWindowContentChanged(WebWindowEvent event) {
-		redirectCount++;
-		if(redirectCount > maxRedirects)
-			throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects);
-	}
+  @Override
+  public void webWindowContentChanged(WebWindowEvent event) {
+    redirectCount++;
+    if(redirectCount > maxRedirects)
+      throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects);
+  }
 
-	@Override
-	public void webWindowClosed(WebWindowEvent event) {
-		
-	}
-	
+  @Override
+  public void webWindowClosed(WebWindowEvent event) {
+    
+  }
+  
 }
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 72b1fa1..a2f3b1e 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -344,7 +344,8 @@ public class HttpResponse implements Response {
 
   @Override
   public int getCode() {
-	// TODO Auto-generated method stub
-	return code;
+  // TODO Auto-generated method stub
+  return code;
   }
 }
+