You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by as...@apache.org on 2015/10/08 20:55:28 UTC

svn commit: r1707601 - in /nutch/trunk: ./ src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/

Author: asitang
Date: Thu Oct  8 18:55:27 2015
New Revision: 1707601

URL: http://svn.apache.org/viewvc?rev=1707601&view=rev
Log:
NUTCH-2108 Add a function to the selenium interactive plugin interface to do multiple manipulation of driver and then return the data this closes #67

Added:
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
Modified:
    nutch/trunk/CHANGES.txt

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1707601&r1=1707600&r2=1707601&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct  8 18:55:27 2015
@@ -2,6 +2,10 @@ Nutch Change Log
    
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2109 Create a brute force click-all-ajax-links utility function for selenium interactive plugin
+
+* NUTCH-2108 Add a function to the selenium interactive plugin interface to do multiple manipulation of driver and then return the data
+
 * NUTCH-2124 Fetcher following same redirect again and again (Yogendra Kumar Soni via snagel)
 
 * NUTCH-2123 Seed List REST API returns Text but headers indicate/require JSON

Added: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java?rev=1707601&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java Thu Oct  8 18:55:27 2015
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.apache.hadoop.util.StringUtils;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a placeholder/example of a technique or use case where we perform multiple 
+ * interactions with the web driver and need the data from each interaction at the end. This code shows that after you have 
+ * performed multiple interactions and accumulated data, you can append that data to the page held by the driver.  
+ */
+public class DefalultMultiInteractionHandler implements
+    InteractiveSeleniumHandler {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DefalultMultiInteractionHandler.class);
+
+  public void processDriver(WebDriver driver) {
+    try {
+      // loop and get multiple pages in this string
+      String accumulatedData = "";
+      // append the string to the last page's driver
+      JavascriptExecutor jsx = (JavascriptExecutor) driver;
+      jsx.executeScript("document.body.innerHTML=document.body.innerHTML "
+          + accumulatedData + ";");
+    } catch (Exception e) {
+      LOG.info(StringUtils.stringifyException(e));
+    }
+  }
+
+  public boolean shouldProcessURL(String URL) {
+    return true;
+  }
+}

Added: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java?rev=1707601&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java Thu Oct  8 18:55:27 2015
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.NutchConfiguration;
+import org.openqa.selenium.By;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.support.ui.WebDriverWait;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This handler clicks all the <a href="javascript:void(null);"> tags
+ * because it treats them as ajax links/interactions rather than ordinary links. It uses the same logic as 
+ * DefalultMultiInteractionHandler. 
+ */
+public class DefaultClickAllAjaxLinksHandler implements InteractiveSeleniumHandler {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DefaultClickAllAjaxLinksHandler.class);
+
+  public void processDriver(WebDriver driver) {
+
+    try {
+      String accumulatedData = "";
+
+      driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+      Configuration conf = NutchConfiguration.create();
+      new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay", 3));
+
+      List<WebElement> atags = driver.findElements(By.tagName("a"));
+      int numberofajaxlinks = atags.size();
+      for (int i = 0; i < numberofajaxlinks; i++) {
+
+        if (atags.get(i).getAttribute("href") != null
+            && atags.get(i).getAttribute("href")
+                .equals("javascript:void(null);")) {
+
+          atags.get(i).click();
+
+          if (i == numberofajaxlinks - 1) {
+            // append everything to the driver in the last round
+            JavascriptExecutor jsx = (JavascriptExecutor) driver;
+            jsx.executeScript("document.body.innerHTML=document.body.innerHTML "
+                + accumulatedData + ";");
+            continue;
+          }
+
+          accumulatedData += driver.findElement(By.tagName("body"))
+              .getAttribute("innerHTML");
+
+          // refreshing the handlers as the page was interacted with
+          driver.navigate().refresh();
+          new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay",
+              3));
+          atags = driver.findElements(By.tagName("a"));
+        }
+      }
+    } catch (Exception e) {
+      LOG.info(StringUtils.stringifyException(e));
+    }
+  }
+
+  public boolean shouldProcessURL(String URL) {
+    return true;
+  }
+}