You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by as...@apache.org on 2015/10/08 20:55:28 UTC
svn commit: r1707601 - in /nutch/trunk: ./
src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/
Author: asitang
Date: Thu Oct 8 18:55:27 2015
New Revision: 1707601
URL: http://svn.apache.org/viewvc?rev=1707601&view=rev
Log:
NUTCH-2108 Add a function to the selenium interactive plugin interface to do multiple manipulation of driver and then return the data this closes #67
Added:
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
Modified:
nutch/trunk/CHANGES.txt
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1707601&r1=1707600&r2=1707601&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct 8 18:55:27 2015
@@ -2,6 +2,10 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2109 Create a brute force click-all-ajax-links utility function for selenium interactive plugin
+
+* NUTCH-2108 Add a function to the selenium interactive plugin interface to perform multiple manipulations of the driver and then return the data
+
* NUTCH-2124 Fetcher following same redirect again and again (Yogendra Kumar Soni via snagel)
* NUTCH-2123 Seed List REST API returns Text but headers indicate/require JSON
Added: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java?rev=1707601&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java Thu Oct 8 18:55:27 2015
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.apache.hadoop.util.StringUtils;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a placeholder/example of a technique or use case where we do multiple
+ * interaction with the web driver and need data from each such interaction in the end. This code shows that after you have
+ * done multiple interactions and accumulated data you can in the end append that to the driver.
+ */
+public class DefalultMultiInteractionHandler implements
+ InteractiveSeleniumHandler {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DefalultMultiInteractionHandler.class);
+
+ public void processDriver(WebDriver driver) {
+ try {
+ // loop and get multiple pages in this string
+ String accumulatedData = "";
+ // append the string to the last page's driver
+ JavascriptExecutor jsx = (JavascriptExecutor) driver;
+ jsx.executeScript("document.body.innerHTML=document.body.innerHTML "
+ + accumulatedData + ";");
+ } catch (Exception e) {
+ LOG.info(StringUtils.stringifyException(e));
+ }
+ }
+
+ public boolean shouldProcessURL(String URL) {
+ return true;
+ }
+}
Added: nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java?rev=1707601&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java Thu Oct 8 18:55:27 2015
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.NutchConfiguration;
+import org.openqa.selenium.By;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.support.ui.WebDriverWait;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This handler clicks all the <a hfer="javascript:void(null);"> tags
+ * because it considers them as not usual links but ajax links/interactions. This uses the same logic of
+ * DefalultMultiInteractionHandler.
+ */
+public class DefaultClickAllAjaxLinksHandler implements InteractiveSeleniumHandler {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DefaultClickAllAjaxLinksHandler.class);
+
+ public void processDriver(WebDriver driver) {
+
+ try {
+ String accumulatedData = "";
+
+ driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+ Configuration conf = NutchConfiguration.create();
+ new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay", 3));
+
+ List<WebElement> atags = driver.findElements(By.tagName("a"));
+ int numberofajaxlinks = atags.size();
+ for (int i = 0; i < numberofajaxlinks; i++) {
+
+ if (atags.get(i).getAttribute("href") != null
+ && atags.get(i).getAttribute("href")
+ .equals("javascript:void(null);")) {
+
+ atags.get(i).click();
+
+ if (i == numberofajaxlinks - 1) {
+ // append everything to the driver in the last round
+ JavascriptExecutor jsx = (JavascriptExecutor) driver;
+ jsx.executeScript("document.body.innerHTML=document.body.innerHTML "
+ + accumulatedData + ";");
+ continue;
+ }
+
+ accumulatedData += driver.findElement(By.tagName("body"))
+ .getAttribute("innerHTML");
+
+ // refreshing the handlers as the page was interacted with
+ driver.navigate().refresh();
+ new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay",
+ 3));
+ atags = driver.findElements(By.tagName("a"));
+ }
+ }
+ } catch (Exception e) {
+ LOG.info(StringUtils.stringifyException(e));
+ }
+ }
+
+ public boolean shouldProcessURL(String URL) {
+ return true;
+ }
+}