You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/04/18 00:35:45 UTC
[1/5] nutch git commit: fix for NUTCH-2191 contributed by karanjeets
Repository: nutch
Updated Branches:
refs/heads/master d6bcefd92 -> 044e8e77e
fix for NUTCH-2191 contributed by karanjeets
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/fa334722
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/fa334722
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/fa334722
Branch: refs/heads/master
Commit: fa33472297aca6a6468461bb6945225c93590d6d
Parents: a9b2491
Author: Karanjeet Singh <co...@gmail.com>
Authored: Sat Mar 26 23:21:28 2016 -0700
Committer: Karanjeet Singh <co...@gmail.com>
Committed: Sat Mar 26 23:21:28 2016 -0700
----------------------------------------------------------------------
build.xml | 6 +
conf/nutch-default.xml | 66 ++++
src/plugin/build.xml | 4 +
src/plugin/lib-htmlunit/build-ivy.xml | 54 +++
src/plugin/lib-htmlunit/build.xml | 28 ++
src/plugin/lib-htmlunit/ivy.xml | 52 +++
src/plugin/lib-htmlunit/plugin.xml | 166 +++++++++
.../protocol/htmlunit/HtmlUnitWebDriver.java | 190 ++++++++++
.../htmlunit/HtmlUnitWebWindowListener.java | 36 ++
src/plugin/protocol-htmlunit/build.xml | 53 +++
src/plugin/protocol-htmlunit/ivy.xml | 38 ++
src/plugin/protocol-htmlunit/plugin.xml | 51 +++
.../apache/nutch/protocol/htmlunit/Http.java | 67 ++++
.../nutch/protocol/htmlunit/HttpResponse.java | 350 +++++++++++++++++++
.../apache/nutch/protocol/htmlunit/package.html | 5 +
15 files changed, 1166 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index f8aa196..5cff1ea 100644
--- a/build.xml
+++ b/build.xml
@@ -189,6 +189,7 @@
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
+ <packageset dir="${plugins.dir}/lib-htmlunit/src/java"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
<packageset dir="${plugins.dir}/lib-selenium/src/java"/>
<packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
@@ -203,6 +204,7 @@
<packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
+ <packageset dir="${plugins.dir}/protocol-htmlunit/src/java"/>
<packageset dir="${plugins.dir}/protocol-http/src/java"/>
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
<packageset dir="${plugins.dir}/protocol-interactiveselenium/src/java"/>
@@ -629,6 +631,7 @@
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
+ <packageset dir="${plugins.dir}/lib-htmlunit/src/java"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
<packageset dir="${plugins.dir}/lib-selenium/src/java"/>
<packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
@@ -643,6 +646,7 @@
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
+ <packageset dir="${plugins.dir}/protocol-htmlunit/src/java"/>
<packageset dir="${plugins.dir}/protocol-http/src/java"/>
<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
<packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
@@ -1033,6 +1037,7 @@
<source path="${plugins.dir}/index-static/src/test/" />
<source path="${plugins.dir}/language-identifier/src/java/" />
<source path="${plugins.dir}/language-identifier/src/test/" />
+ <source path="${plugins.dir}/lib-htmlunit/src/java/" />
<source path="${plugins.dir}/lib-http/src/java/" />
<source path="${plugins.dir}/lib-http/src/test/" />
<source path="${plugins.dir}/lib-selenium/src/java/" />
@@ -1057,6 +1062,7 @@
<source path="${plugins.dir}/protocol-file/src/java/" />
<source path="${plugins.dir}/protocol-file/src/test/" />
<source path="${plugins.dir}/protocol-ftp/src/java/" />
+ <source path="${plugins.dir}/protocol-htmlunit/src/java"/>
<source path="${plugins.dir}/protocol-httpclient/src/java/" />
<source path="${plugins.dir}/protocol-httpclient/src/test/" />
<source path="${plugins.dir}/protocol-http/src/java/" />
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 93503f3..a5f17bf 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1874,6 +1874,72 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
</description>
</property>
+
+<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit -->
+
+<property>
+ <name>htmlunit.page.load.delay</name>
+ <value>3</value>
+ <description>
+ The delay in seconds to use when loading a page with lib-htmlunit. This
+ setting is used by protocol-htmlunit, since it depends on
+ lib-htmlunit for fetching.
+ </description>
+</property>
+
+<property>
+ <name>htmlunit.enable.javascript</name>
+ <value>true</value>
+ <description>
+ A Boolean value representing if javascript should
+ be enabled or disabled when using htmlunit. The default value is enabled.
+ </description>
+</property>
+
+<property>
+ <name>htmlunit.javascript.timeout</name>
+ <value>3500</value>
+ <description>
+ The timeout in milliseconds when loading javascript with lib-htmlunit. This
+ setting is used by protocol-htmlunit, since it depends on
+ lib-htmlunit for fetching.
+ </description>
+</property>
+
+<property>
+ <name>htmlunit.enable.css</name>
+ <value>false</value>
+ <description>
+ A Boolean value representing if CSS should
+ be enabled or disabled when using htmlunit. The default value is disabled.
+ </description>
+</property>
+
+<property>
+ <name>htmlunit.take.screenshot</name>
+ <value>false</value>
+ <description>
+ Boolean property determining whether the protocol-htmlunit
+ WebDriver should capture a screenshot of the URL. If set to
+ true remember to define the 'htmlunit.screenshot.location'
+ property as this determines the location screenshots should be
+ persisted to on HDFS. If that property is not set, screenshots
+ are simply discarded.
+ </description>
+</property>
+
+<property>
+ <name>htmlunit.screenshot.location</name>
+ <value></value>
+ <description>
+ The location on the filesystem configured for Hadoop (e.g. HDFS) where a
+ URL screenshot should be saved if the 'htmlunit.take.screenshot' property
+ is set to true. By default this is unset; in that case screenshots held
+ in memory are simply discarded.
+ </description>
+</property>
+
+
<!-- protocol-selenium plugin properties -->
<property>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 10731b3..75ae2e7 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -53,6 +53,8 @@
<ant dir="protocol-ftp" target="deploy"/>
<ant dir="protocol-http" target="deploy"/>
<ant dir="protocol-httpclient" target="deploy"/>
+ <ant dir="lib-htmlunit" target="deploy"/>
+ <ant dir="protocol-htmlunit" target="deploy" />
<ant dir="lib-selenium" target="deploy"/>
<ant dir="protocol-selenium" target="deploy" />
<ant dir="protocol-interactiveselenium" target="deploy" />
@@ -170,6 +172,8 @@
<ant dir="protocol-ftp" target="clean"/>
<ant dir="protocol-http" target="clean"/>
<ant dir="protocol-httpclient" target="clean"/>
+ <ant dir="lib-htmlunit" target="clean"/>
+ <ant dir="protocol-htmlunit" target="clean" />
<ant dir="lib-selenium" target="clean"/>
<ant dir="protocol-selenium" target="clean" />
<ant dir="protocol-interactiveselenium" target="clean" />
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/build-ivy.xml b/src/plugin/lib-htmlunit/build-ivy.xml
new file mode 100644
index 0000000..7022f4e
--- /dev/null
+++ b/src/plugin/lib-htmlunit/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-htmlunit" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+ <property name="ivy.install.version" value="2.1.0" />
+ <condition property="ivy.home" value="${env.IVY_HOME}">
+ <isset property="env.IVY_HOME" />
+ </condition>
+ <property name="ivy.home" value="${user.home}/.ant" />
+ <property name="ivy.checksums" value="" />
+ <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+ <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+ <target name="download-ivy" unless="offline">
+
+ <mkdir dir="${ivy.jar.dir}"/>
+ <!-- download Ivy from web site so that it can be used even without any special installation -->
+ <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+ dest="${ivy.jar.file}" usetimestamp="true"/>
+ </target>
+
+ <target name="init-ivy" depends="download-ivy">
+ <!-- try to load ivy here from ivy home, in case the user has not already dropped
+ it into ant's lib dir (note that the latter copy will always take precedence).
+ We will not fail as long as local lib dir exists (it may be empty) and
+ ivy is in at least one of ant's lib dir or the local lib dir. -->
+ <path id="ivy.lib.path">
+ <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+ </path>
+ <taskdef resource="org/apache/ivy/ant/antlib.xml"
+ uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+ </target>
+
+ <target name="deps-jar" depends="init-ivy">
+ <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/build.xml b/src/plugin/lib-htmlunit/build.xml
new file mode 100644
index 0000000..14f5d8f
--- /dev/null
+++ b/src/plugin/lib-htmlunit/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-htmlunit" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-http/*.jar" />
+ </fileset>
+ </path>
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml
new file mode 100644
index 0000000..6430535
--- /dev/null
+++ b/src/plugin/lib-htmlunit/ivy.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <!-- begin selenium dependencies -->
+ <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+
+ <dependency org="com.opera" name="operadriver" rev="1.5">
+ <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+ </dependency>
+ <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+ <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+ <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+ </dependency>
+ <!-- end selenium dependencies -->
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/plugin.xml b/src/plugin/lib-htmlunit/plugin.xml
new file mode 100644
index 0000000..290a137
--- /dev/null
+++ b/src/plugin/lib-htmlunit/plugin.xml
@@ -0,0 +1,166 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! HtmlUnit and Selenium driver libraries shared by HtmlUnit-based protocol plugins
+ !-->
+<plugin
+ id="lib-htmlunit"
+ name="HTTP Framework"
+ version="1.0"
+ provider-name="org.apache.nutch">
+
+ <runtime>
+ <library name="lib-htmlunit.jar">
+ <export name="*"/>
+ </library>
+ <!-- all classes from dependent libraries are exported -->
+ <library name="cglib-nodep-2.1_3.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-codec-1.9.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-collections-3.2.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-exec-1.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-io-2.4.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-jxpath-1.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-lang3-3.3.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-logging-1.1.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="cssparser-0.9.14.jar">
+ <export name="*"/>
+ </library>
+ <library name="gson-2.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="guava-18.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="htmlunit-2.15.jar">
+ <export name="*"/>
+ </library>
+ <library name="htmlunit-core-js-2.15.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpclient-4.3.4.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpcore-4.3.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpmime-4.3.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="ini4j-0.5.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="jetty-http-8.1.15.v20140411.jar">
+ <export name="*"/>
+ </library>
+ <library name="jetty-io-8.1.15.v20140411.jar">
+ <export name="*"/>
+ </library>
+ <library name="jetty-util-8.1.15.v20140411.jar">
+ <export name="*"/>
+ </library>
+ <library name="jetty-websocket-8.1.15.v20140411.jar">
+ <export name="*"/>
+ </library>
+ <library name="jna-3.4.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="nekohtml-1.9.21.jar">
+ <export name="*"/>
+ </library>
+ <library name="netty-3.5.2.Final.jar">
+ <export name="*"/>
+ </library>
+ <library name="operadriver-1.5.jar">
+ <export name="*"/>
+ </library>
+ <library name="operalaunchers-1.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="phantomjsdriver-1.2.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="platform-3.4.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="protobuf-java-2.4.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="sac-1.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-api-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-chrome-driver-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-firefox-driver-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-htmlunit-driver-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-ie-driver-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-java-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-remote-driver-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-safari-driver-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="selenium-support-2.44.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="serializer-2.7.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="webbit-0.4.14.jar">
+ <export name="*"/>
+ </library>
+ <library name="xalan-2.7.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="xercesImpl-2.11.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="xml-apis-1.4.01.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
new file mode 100644
index 0000000..fc231c3
--- /dev/null
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.openqa.selenium.By;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
+import org.openqa.selenium.TimeoutException;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.htmlunit.HtmlUnitDriver;
+import org.openqa.selenium.io.TemporaryFilesystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.gargoylesoftware.htmlunit.WebClient;
+
+/**
+ * {@link HtmlUnitDriver} specialisation that configures the underlying
+ * HtmlUnit {@link WebClient} from Nutch configuration properties and offers
+ * static helpers for fetching a URL and extracting its HTML.
+ *
+ * <p>The settings are held in static fields which are overwritten on every
+ * call to {@link #getDriverForPage(String, Configuration)} and then read by
+ * the constructor and by {@link #modifyWebClient(WebClient)}.
+ * NOTE(review): these fields are shared by all threads; concurrent callers
+ * using different configurations could observe each other's values - confirm
+ * that all callers share one configuration.</p>
+ */
+public class HtmlUnitWebDriver extends HtmlUnitDriver {
+
+  private static final Logger LOG = LoggerFactory.getLogger(HtmlUnitWebDriver.class);
+  private static boolean enableJavascript;
+  private static boolean enableCss;
+  private static boolean enableRedirect;
+  private static long javascriptTimeout;
+  private static int maxRedirects;
+
+  /**
+   * Creates a driver whose JavaScript support follows the value most recently
+   * read from <code>htmlunit.enable.javascript</code>.
+   */
+  public HtmlUnitWebDriver() {
+    super(enableJavascript);
+  }
+
+  /**
+   * Applies the JavaScript, CSS and redirect settings to the web client.
+   * Script errors never raise exceptions. When redirects are enabled a
+   * {@link HtmlUnitWebWindowListener} limited to <code>maxRedirects</code>
+   * is registered.
+   *
+   * @param client the client created by the parent driver
+   * @return the same client instance, reconfigured
+   */
+  @Override
+  protected WebClient modifyWebClient(WebClient client) {
+    client.getOptions().setJavaScriptEnabled(enableJavascript);
+    client.getOptions().setCssEnabled(enableCss);
+    client.getOptions().setRedirectEnabled(enableRedirect);
+    if (enableJavascript) {
+      client.setJavaScriptTimeout(javascriptTimeout);
+    }
+    client.getOptions().setThrowExceptionOnScriptError(false);
+    if (enableRedirect) {
+      client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
+    }
+    return client;
+  }
+
+  /**
+   * Reads the <code>htmlunit.*</code> and <code>http.redirect.max</code>
+   * properties, creates a driver and loads the given URL.
+   *
+   * <p>If the page load times out, the driver is returned as is so that the
+   * caller can capture whatever was loaded so far. On any other failure the
+   * driver is cleaned up and the failure is rethrown.</p>
+   *
+   * @param url the URL to load
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the driver positioned on the (possibly partially) loaded page;
+   *         the caller is responsible for {@link #cleanUpDriver(WebDriver)}
+   * @throws RuntimeException wrapping any non-timeout failure
+   */
+  public static WebDriver getDriverForPage(String url, Configuration conf) {
+    long pageLoadTimeout = conf.getLong("htmlunit.page.load.delay", 3);
+    enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
+    enableCss = conf.getBoolean("htmlunit.enable.css", false);
+    javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
+    maxRedirects = conf.getInt("http.redirect.max", 0);
+    // Redirects are only followed when a positive limit is configured.
+    enableRedirect = maxRedirects > 0;
+
+    WebDriver driver = null;
+    try {
+      driver = new HtmlUnitWebDriver();
+      driver.manage().timeouts().pageLoadTimeout(pageLoadTimeout, TimeUnit.SECONDS);
+      driver.get(url);
+    } catch (TimeoutException e) {
+      LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+    } catch (Exception e) {
+      cleanUpDriver(driver);
+      throw new RuntimeException(e);
+    }
+    return driver;
+  }
+
+  /**
+   * Extracts the HTML of the page currently loaded in the driver, taking a
+   * screenshot first when <code>htmlunit.take.screenshot</code> is true.
+   *
+   * <p>The driver is cleaned up only when extraction fails; on success it is
+   * left open for the caller.</p>
+   *
+   * @param driver a driver obtained from
+   *        {@link #getDriverForPage(String, Configuration)}
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the inner HTML of the body when JavaScript is enabled, otherwise
+   *         the page source with <code>&amp;amp;</code> unescaped
+   * @throws RuntimeException wrapping any failure
+   */
+  public static String getHTMLContent(WebDriver driver, Configuration conf) {
+    try {
+      return extractHtml(driver, conf);
+    } catch (Exception e) {
+      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      cleanUpDriver(driver);
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Closes and quits the driver and removes Selenium's temporary files.
+   * <code>quit()</code> and the temporary file removal are attempted even
+   * when <code>close()</code> fails. A <code>null</code> driver is ignored.
+   *
+   * @param driver the driver to release, may be <code>null</code>
+   * @throws RuntimeException wrapping any failure raised while closing
+   */
+  public static void cleanUpDriver(WebDriver driver) {
+    if (driver == null) {
+      return;
+    }
+    try {
+      try {
+        driver.close();
+      } finally {
+        driver.quit();
+      }
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    } finally {
+      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+    }
+  }
+
+  /**
+   * Function for obtaining the HTML BODY using the selected
+   * {@link org.openqa.selenium.WebDriver}.
+   * There are a number of configuration properties within
+   * <code>nutch-site.xml</code> which determine whether to
+   * take screenshots of the rendered pages and persist them
+   * as timestamped .png's into HDFS.
+   * The driver is always cleaned up before this method returns.
+   * @param url the URL to fetch and render
+   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * @return the rendered inner HTML page
+   * @throws RuntimeException wrapping any failure
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = getDriverForPage(url, conf);
+    try {
+      return extractHtml(driver, conf);
+    } catch (Exception e) {
+      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      throw new RuntimeException(e);
+    } finally {
+      cleanUpDriver(driver);
+    }
+  }
+
+  /**
+   * Shared extraction step: optional screenshot, then body inner HTML (with
+   * JavaScript) or unescaped page source (without JavaScript).
+   */
+  private static String extractHtml(WebDriver driver, Configuration conf) {
+    if (conf.getBoolean("htmlunit.take.screenshot", false)) {
+      takeScreenshot(driver, conf);
+    }
+    if (enableJavascript) {
+      WebElement body = driver.findElement(By.tagName("body"));
+      return (String) ((JavascriptExecutor) driver)
+          .executeScript("return arguments[0].innerHTML;", body);
+    }
+    // The page source escapes ampersands as entities; restore them.
+    return driver.getPageSource().replace("&amp;", "&");
+  }
+
+  /**
+   * Takes a screenshot and copies it below
+   * <code>htmlunit.screenshot.location</code> on the configured Hadoop
+   * filesystem. Nothing is stored when the location is missing or blank, or
+   * when a file of the same name already exists there. Driver cleanup on
+   * failure is left to the callers, which all perform it.
+   *
+   * @throws RuntimeException wrapping any failure
+   */
+  private static void takeScreenshot(WebDriver driver, Configuration conf) {
+    try {
+      String url = driver.getCurrentUrl();
+      File srcFile = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
+      LOG.debug("In-memory screenshot taken of: {}", url);
+
+      String location = conf.get("htmlunit.screenshot.location");
+      if (location == null || location.trim().isEmpty()) {
+        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
+            + "'htmlunit.screenshot.location' is absent from nutch-site.xml.", url);
+        return;
+      }
+
+      FileSystem fs = FileSystem.get(conf);
+      Path screenshotPath = new Path(location + "/" + srcFile.getName());
+      if (fs.exists(screenshotPath)) {
+        // Previously a null output stream was handed to copyBytes here.
+        LOG.debug("Screenshot already exists at {}; not overwriting.", screenshotPath);
+        return;
+      }
+
+      LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
+      InputStream is = null;
+      OutputStream os = null;
+      try {
+        is = new BufferedInputStream(new FileInputStream(srcFile));
+        os = fs.create(screenshotPath);
+        IOUtils.copyBytes(is, os, conf);
+      } finally {
+        // Releases whichever stream was opened if a step above failed.
+        IOUtils.closeStream(os);
+        IOUtils.closeStream(is);
+      }
+      LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName());
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
new file mode 100644
index 0000000..760f4aa
--- /dev/null
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
@@ -0,0 +1,36 @@
+package org.apache.nutch.protocol.htmlunit;
+
+import com.gargoylesoftware.htmlunit.WebWindowEvent;
+import com.gargoylesoftware.htmlunit.WebWindowListener;
+
+/**
+ * {@link WebWindowListener} that counts window content changes and fails
+ * once the count exceeds a configured maximum. Window open and close events
+ * are ignored.
+ */
+public class HtmlUnitWebWindowListener implements WebWindowListener {
+
+  /** Number of content-change events seen so far. */
+  private int redirectCount;
+  /** Highest number of content-change events tolerated. */
+  private int maxRedirects;
+
+  /** Creates a listener that tolerates no content change at all. */
+  public HtmlUnitWebWindowListener() {
+    this(0);
+  }
+
+  /**
+   * @param maxRedirects highest number of content changes tolerated
+   */
+  public HtmlUnitWebWindowListener(int maxRedirects) {
+    this.redirectCount = 0;
+    this.maxRedirects = maxRedirects;
+  }
+
+  @Override
+  public void webWindowOpened(WebWindowEvent event) {
+    // intentionally empty
+  }
+
+  /**
+   * Counts the event and aborts when the limit has been passed.
+   *
+   * @throws RuntimeException when the count exceeds the maximum
+   */
+  @Override
+  public void webWindowContentChanged(WebWindowEvent event) {
+    redirectCount += 1;
+    if (redirectCount <= maxRedirects) {
+      return;
+    }
+    throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects);
+  }
+
+  @Override
+  public void webWindowClosed(WebWindowEvent event) {
+    // intentionally empty
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml
new file mode 100644
index 0000000..0ed0228
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/build.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-htmlunit" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-http"/>
+ <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-http/*.jar" />
+ <include name="**/lib-htmlunit/*.jar" />
+ </fileset>
+ <pathelement location="${build.dir}/test/conf"/>
+ </path>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../lib-http"/>
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ <copy toDir="${build.test}">
+ <fileset dir="${src.test}" excludes="**/*.java"/>
+ </copy>
+ </target>
+
+ <!-- for junit test -->
+ <!--
+ <mkdir dir="${build.test}/data" />
+ <copy todir="${build.test}/data">
+ <fileset dir="jsp"/>
+ </copy>-->
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/ivy.xml b/src/plugin/protocol-htmlunit/ivy.xml
new file mode 100644
index 0000000..8aa78d2
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/ivy.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/plugin.xml b/src/plugin/protocol-htmlunit/plugin.xml
new file mode 100644
index 0000000..36bcb80
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="protocol-htmlunit"
+ name="HtmlUnit Protocol Plug-in"
+ version="1.0.0"
+ provider-name="nutch.apache.org">
+
+ <runtime>
+ <library name="protocol-htmlunit.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-http"/>
+ <import plugin="lib-htmlunit"/>
+ </requires>
+
+ <extension id="org.apache.nutch.protocol.http"
+ name="HttpProtocol"
+ point="org.apache.nutch.protocol.Protocol">
+
+ <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+ class="org.apache.nutch.protocol.htmlunit.Http">
+ <parameter name="protocolName" value="http"/>
+ </implementation>
+
+ <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+ class="org.apache.nutch.protocol.htmlunit.Http">
+ <parameter name="protocolName" value="https"/>
+ </implementation>
+
+ </extension>
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
new file mode 100644
index 0000000..83b7687
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.IOException;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ *
+ *
+ */
+public class Http extends HttpBase {
+
+ public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+ /**
+ * Default constructor.
+ */
+ public Http() {
+ super(LOG);
+ }
+
+ /**
+ * Set the {@link org.apache.hadoop.conf.Configuration} object.
+ *
+ * @param conf
+ */
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ }
+
+ public static void main(String[] args) throws Exception {
+ Http http = new Http();
+ http.setConf(NutchConfiguration.create());
+ main(http, args);
+ }
+
+ protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+ throws ProtocolException, IOException {
+ return new HttpResponse(this, url, datum);
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
new file mode 100644
index 0000000..72b1fa1
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -0,0 +1,350 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An HTTP response.
+ *
+ */
+public class HttpResponse implements Response {
+
+ private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class);
+
+ private Http http;
+ private URL url;
+ private byte[] content;
+ private int code;
+ private Metadata headers = new SpellCheckedMetadata();
+
+ /** The nutch configuration */
+ private Configuration conf = null;
+
+ public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+
+ this.conf = http.getConf();
+ this.http = http;
+ this.url = url;
+
+ LOG.info("fetching " + url);
+
+ String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+ // some servers will redirect a request with a host line like
+ // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+ // don't want the :80...
+ String host = url.getHost();
+ int port;
+ String portString;
+ if (url.getPort() == -1) {
+ port = 80;
+ portString = "";
+ } else {
+ port = url.getPort();
+ portString = ":" + port;
+ }
+
+ Socket socket = null;
+
+ try {
+ socket = new Socket(); // create the socket
+ socket.setSoTimeout(http.getTimeout());
+
+ // connect
+ String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+ int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
+ InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+ socket.connect(sockAddr, http.getTimeout());
+
+ // make request
+ OutputStream req = socket.getOutputStream();
+
+ StringBuffer reqStr = new StringBuffer("GET ");
+ if (http.useProxy(url)) {
+ reqStr.append(url.getProtocol() + "://" + host + portString + path);
+ } else {
+ reqStr.append(path);
+ }
+
+ // TODO: Write code for Https
+ reqStr.append(" HTTP/1.0\r\n");
+
+ reqStr.append("Host: ");
+ reqStr.append(host);
+ reqStr.append(portString);
+ reqStr.append("\r\n");
+
+ reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+ String userAgent = http.getUserAgent();
+ if ((userAgent == null) || (userAgent.length() == 0)) {
+ if (Http.LOG.isErrorEnabled()) {
+ Http.LOG.error("User-agent is not set!");
+ }
+ } else {
+ reqStr.append("User-Agent: ");
+ reqStr.append(userAgent);
+ reqStr.append("\r\n");
+ }
+
+ reqStr.append("Accept-Language: ");
+ reqStr.append(this.http.getAcceptLanguage());
+ reqStr.append("\r\n");
+
+ reqStr.append("Accept: ");
+ reqStr.append(this.http.getAccept());
+ reqStr.append("\r\n");
+
+ if (datum.getModifiedTime() > 0) {
+ reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+ reqStr.append("\r\n");
+ }
+ reqStr.append("\r\n");
+
+ byte[] reqBytes = reqStr.toString().getBytes();
+
+ req.write(reqBytes);
+ req.flush();
+
+ PushbackInputStream in = // process response
+ new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+ Http.BUFFER_SIZE);
+
+ StringBuffer line = new StringBuffer();
+
+ boolean haveSeenNonContinueStatus = false;
+ while (!haveSeenNonContinueStatus) {
+ // parse status code line
+ this.code = parseStatusLine(in, line);
+ // parse headers
+ parseHeaders(in, line);
+ haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+ }
+
+ // Get Content type header
+ String contentType = getHeader(Response.CONTENT_TYPE);
+
+ // handle with HtmlUnit only if content type is HTML or XHTML
+ if (contentType != null) {
+ if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+ readPlainContent(url);
+ } else {
+ try {
+ int contentLength = Integer.MAX_VALUE;
+ String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+ if (contentLengthString != null) {
+ try {
+ contentLength = Integer.parseInt(contentLengthString.trim());
+ } catch (NumberFormatException ex) {
+ throw new HttpException("bad content length: " + contentLengthString);
+ }
+ }
+
+ if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+ contentLength = http.getMaxContent();
+ }
+
+ byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+ int bufferFilled = 0;
+ int totalRead = 0;
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
+ && totalRead + bufferFilled <= contentLength) {
+ totalRead += bufferFilled;
+ out.write(buffer, 0, bufferFilled);
+ }
+
+ content = out.toByteArray();
+
+ } catch (Exception e) {
+ if (code == 200)
+ throw new IOException(e.toString());
+ // for codes other than 200 OK, we are fine with empty content
+ } finally {
+ if (in != null) {
+ in.close();
+ }
+ }
+ }
+ }
+
+ } finally {
+ if (socket != null)
+ socket.close();
+ }
+ }
+
+ private void readPlainContent(URL url) throws IOException {
+ String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
+ content = page.getBytes("UTF-8");
+ }
+
+ private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+ readLine(in, line, false);
+
+ int codeStart = line.indexOf(" ");
+ int codeEnd = line.indexOf(" ", codeStart + 1);
+
+ // handle lines with no plaintext result code, ie:
+ // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+ if (codeEnd == -1)
+ codeEnd = line.length();
+
+ int code;
+ try {
+ code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+ } catch (NumberFormatException e) {
+ throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+ }
+
+ return code;
+ }
+
+ private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+
+ int colonIndex = line.indexOf(":"); // key is up to colon
+ if (colonIndex == -1) {
+ int i;
+ for (i = 0; i < line.length(); i++)
+ if (!Character.isWhitespace(line.charAt(i)))
+ break;
+ if (i == line.length())
+ return;
+ throw new HttpException("No colon in header:" + line);
+ }
+ String key = line.substring(0, colonIndex);
+
+ int valueStart = colonIndex + 1; // skip whitespace
+ while (valueStart < line.length()) {
+ int c = line.charAt(valueStart);
+ if (c != ' ' && c != '\t')
+ break;
+ valueStart++;
+ }
+ String value = line.substring(valueStart);
+ headers.set(key, value);
+ }
+
+ // Adds headers to our headers Metadata
+ private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+
+ while (readLine(in, line, true) != 0) {
+
+ // handle HTTP responses with missing blank line after headers
+ int pos;
+ if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+ || ((pos = line.indexOf("<html")) != -1)) {
+
+ in.unread(line.substring(pos).getBytes("UTF-8"));
+ line.setLength(pos);
+
+ try {
+ //TODO: (CM) We don't know the header names here
+ //since we're just handling them generically. It would
+ //be nice to provide some sort of mapping function here
+ //for the returned header names to the standard metadata
+ //names in the ParseData class
+ processHeaderLine(line);
+ } catch (Exception e) {
+ // fixme:
+ Http.LOG.warn("Error: ", e);
+ }
+ return;
+ }
+
+ processHeaderLine(line);
+ }
+ }
+
+ private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
+ throws IOException {
+ line.setLength(0);
+ for (int c = in.read(); c != -1; c = in.read()) {
+ switch (c) {
+ case '\r':
+ if (peek(in) == '\n') {
+ in.read();
+ }
+ case '\n':
+ if (line.length() > 0) {
+ // at EOL -- check for continued line if the current
+ // (possibly continued) line wasn't blank
+ if (allowContinuedLine)
+ switch (peek(in)) {
+ case ' ':
+ case '\t': // line is continued
+ in.read();
+ continue;
+ }
+ }
+ return line.length(); // else complete
+ default:
+ line.append((char) c);
+ }
+ }
+ throw new EOFException();
+ }
+
+ private static int peek(PushbackInputStream in) throws IOException {
+ int value = in.read();
+ in.unread(value);
+ return value;
+ }
+
+ public URL getUrl() {
+ return url;
+ }
+
+ public String getHeader(String name) {
+ return headers.get(name);
+ }
+
+ public Metadata getHeaders() {
+ return headers;
+ }
+
+ public byte[] getContent() {
+ return content;
+ }
+
+ @Override
+ public int getCode() {
+ // TODO Auto-generated method stub
+ return code;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
new file mode 100644
index 0000000..34d1d1c
--- /dev/null
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
+</body>
+</html>
[5/5] nutch git commit: fix conflicts in CHANGES.txt
Posted by ma...@apache.org.
fix conflicts in CHANGES.txt
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/044e8e77
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/044e8e77
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/044e8e77
Branch: refs/heads/master
Commit: 044e8e77e015df051d0345927eca4fa5f38527fd
Parents: d6bcefd f529359
Author: Chris Mattmann <ma...@apache.org>
Authored: Sun Apr 17 15:35:33 2016 -0700
Committer: Chris Mattmann <ma...@apache.org>
Committed: Sun Apr 17 15:35:33 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 2 +
build.xml | 6 +
conf/nutch-default.xml | 82 +++--
default.properties | 3 +-
src/plugin/build.xml | 4 +
src/plugin/lib-htmlunit/build-ivy.xml | 54 +++
src/plugin/lib-htmlunit/build.xml | 28 ++
src/plugin/lib-htmlunit/ivy.xml | 52 +++
src/plugin/lib-htmlunit/plugin.xml | 166 +++++++++
.../protocol/htmlunit/HtmlUnitWebDriver.java | 189 ++++++++++
.../htmlunit/HtmlUnitWebWindowListener.java | 53 +++
.../nutch/protocol/selenium/HttpWebClient.java | 37 +-
src/plugin/protocol-htmlunit/build.xml | 46 +++
src/plugin/protocol-htmlunit/ivy.xml | 38 ++
src/plugin/protocol-htmlunit/plugin.xml | 51 +++
.../apache/nutch/protocol/htmlunit/Http.java | 63 ++++
.../nutch/protocol/htmlunit/HttpResponse.java | 347 +++++++++++++++++++
.../apache/nutch/protocol/htmlunit/package.html | 21 ++
18 files changed, 1197 insertions(+), 45 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/044e8e77/CHANGES.txt
----------------------------------------------------------------------
diff --cc CHANGES.txt
index a3bde42,1f48eb6..e14d7c5
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@@ -10,10 -10,8 +10,12 @@@ in the release announcement and keep i
Nutch Change Log
+* NUTCH-2250 CommonCrawlDumper : Invalid format and skipped parts (Thamme Gowda N.,lewismc via mattmann)
+
+* NUTCH-2245 Developed the NGram Model on the existing Unigram Cosine Similarity Model (bhavyasanghavi via sujen)
+
+ * NUTCH-2191 Add HtmlUnit plugin in Nutch. (karanjeets and markus17 via mattmann)
+
* NUTCH-2241 Unstable Selenium plugin in Nutch. Fixed bugs and enhanced configuration (karanjeets via mattmann)
* NUTCH-2213 CommonCrawlDataDumper saves gzipped body in extracted form (jnioche via mattmann)
http://git-wip-us.apache.org/repos/asf/nutch/blob/044e8e77/conf/nutch-default.xml
----------------------------------------------------------------------
[4/5] nutch git commit: fix for NUTCH-2191 contributed by karanjeets
Posted by ma...@apache.org.
fix for NUTCH-2191 contributed by karanjeets
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f5293599
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f5293599
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f5293599
Branch: refs/heads/master
Commit: f52935994dec9468edf6087c5e11b3d9ed2517b1
Parents: 3cda222
Author: Karanjeet Singh <co...@gmail.com>
Authored: Tue Mar 29 01:47:10 2016 -0700
Committer: Karanjeet Singh <co...@gmail.com>
Committed: Tue Mar 29 01:47:10 2016 -0700
----------------------------------------------------------------------
conf/nutch-default.xml | 96 ++++++--------------
default.properties | 4 +-
.../protocol/htmlunit/HtmlUnitWebDriver.java | 42 ++++-----
.../htmlunit/HtmlUnitWebWindowListener.java | 16 ++++
.../nutch/protocol/selenium/HttpWebClient.java | 37 ++++----
src/plugin/protocol-htmlunit/build.xml | 7 --
.../apache/nutch/protocol/htmlunit/Http.java | 4 -
.../nutch/protocol/htmlunit/HttpResponse.java | 6 +-
.../apache/nutch/protocol/htmlunit/package.html | 16 ++++
9 files changed, 105 insertions(+), 123 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index a5f17bf..1934991 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1874,20 +1874,44 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
</description>
</property>
-
-<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit -->
+<!-- plugin properties that apply to lib-selenium, protocol-selenium,
+ protocol-interactiveselenium, lib-htmlunit, protocol-htmlunit -->
<property>
- <name>htmlunit.page.load.delay</name>
+ <name>page.load.delay</name>
<value>3</value>
<description>
- The delay in seconds to use when loading a page with lib-htmlunit. This
- setting is used by protocol-htmlunit since they depending on
- lib-htmlunit for fetching.
+ The delay in seconds to use when loading a page with htmlunit or selenium.
+ </description>
+</property>
+
+<property>
+ <name>take.screenshot</name>
+ <value>false</value>
+ <description>
+ Boolean property determining whether the protocol-htmlunit
+ WebDriver should capture a screenshot of the URL. If set to
+ true remember to define the 'screenshot.location'
+ property as this determines the location screenshots should be
+ persisted to on HDFS. If that property is not set, screenshots
+ are simply discarded.
</description>
</property>
<property>
+ <name>screenshot.location</name>
+ <value></value>
+ <description>
+ The location on disk where a URL screenshot should be saved
+ to if the 'take.screenshot' property is set to true.
+ By default this is null, in this case screenshots held in memory
+ are simply discarded.
+ </description>
+</property>
+
+<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit -->
+
+<property>
<name>htmlunit.enable.javascript</name>
<value>true</value>
<description>
@@ -1915,31 +1939,6 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
</description>
</property>
-<property>
- <name>htmlunit.take.screenshot</name>
- <value>false</value>
- <description>
- Boolean property determining whether the protocol-htmlunit
- WebDriver should capture a screenshot of the URL. If set to
- true remember to define the 'htmlunit.screenshot.location'
- property as this determines the location screenshots should be
- persisted to on HDFS. If that property is not set, screenshots
- are simply discarded.
- </description>
-</property>
-
-<property>
- <name>htmlunit.screenshot.location</name>
- <value></value>
- <description>
- The location on disk where a URL screenshot should be saved
- to if the 'htmlunit.take.screenshot' property is set to true.
- By default this is null, in this case screenshots held in memory
- are simply discarded.
- </description>
-</property>
-
-
<!-- protocol-selenium plugin properties -->
<property>
@@ -1956,30 +1955,6 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
</property>
<property>
- <name>selenium.take.screenshot</name>
- <value>false</value>
- <description>
- Boolean property determining whether the protocol-selenium
- WebDriver should capture a screenshot of the URL. If set to
- true remember to define the 'selenium.screenshot.location'
- property as this determines the location screenshots should be
- persisted to on HDFS. If that property is not set, screenshots
- are simply discarded.
- </description>
-</property>
-
-<property>
- <name>selenium.screenshot.location</name>
- <value></value>
- <description>
- The location on disk where a URL screenshot should be saved
- to if the 'selenium.take.screenshot' property is set to true.
- By default this is null, in this case screenshots held in memory
- are simply discarded.
- </description>
-</property>
-
-<property>
<name>selenium.hub.port</name>
<value>4444</value>
<description>Selenium Hub Location connection port</description>
@@ -2069,17 +2044,6 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
Currently this option exist for - 'firefox' </description>
</property>
-<!-- lib-selenium configuration -->
-<property>
- <name>libselenium.page.load.delay</name>
- <value>3</value>
- <description>
- The delay in seconds to use when loading a page with lib-selenium. This
- setting is used by protocol-selenium and protocol-interactiveselenium
- since they depending on lib-selenium for fetching.
- </description>
-</property>
-
<!-- protocol-interactiveselenium configuration -->
<property>
<name>interactiveselenium.handlers</name>
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/default.properties
----------------------------------------------------------------------
diff --git a/default.properties b/default.properties
index aec5d51..eb616c6 100644
--- a/default.properties
+++ b/default.properties
@@ -89,8 +89,8 @@ plugins.protocol=\
org.apache.nutch.protocol.ftp*:\
org.apache.nutch.protocol.http*:\
org.apache.nutch.protocol.httpclient*:\
- org.apache.nutch.protocol.selenium*
- org.apache.nutch.protocol.htmlunit*
+ org.apache.nutch.protocol.selenium*:\
+ org.apache.nutch.protocol.htmlunit*:\
#
# URL Filter Plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
index 5e2c0ac..064894e 100644
--- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
@@ -64,11 +64,11 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
client.getOptions().setThrowExceptionOnScriptError(false);
if(enableRedirect)
client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
- return client;
+ return client;
}
public static WebDriver getDriverForPage(String url, Configuration conf) {
- long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
+ long pageLoadTimout = conf.getLong("page.load.delay", 3);
enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
enableCss = conf.getBoolean("htmlunit.enable.css", false);
javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
@@ -84,8 +84,8 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
driver.get(url);
} catch(Exception e) {
if(e instanceof TimeoutException) {
- LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
- return driver;
+ LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+ return driver;
}
cleanUpDriver(driver);
throw new RuntimeException(e);
@@ -96,19 +96,19 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
public static String getHTMLContent(WebDriver driver, Configuration conf) {
try {
- if (conf.getBoolean("htmlunit.take.screenshot", false))
- takeScreenshot(driver, conf);
+ if (conf.getBoolean("take.screenshot", false))
+ takeScreenshot(driver, conf);
String innerHtml = "";
if(enableJavascript) {
- WebElement body = driver.findElement(By.tagName("body"));
- innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
+ WebElement body = driver.findElement(By.tagName("body"));
+ innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
}
else
- innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+ innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
return innerHtml;
} catch(Exception e) {
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
cleanUpDriver(driver);
throw new RuntimeException(e);
}
@@ -141,23 +141,23 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
WebDriver driver = getDriverForPage(url, conf);
try {
- if (conf.getBoolean("htmlunit.take.screenshot", false))
- takeScreenshot(driver, conf);
+ if (conf.getBoolean("take.screenshot", false))
+ takeScreenshot(driver, conf);
String innerHtml = "";
if(enableJavascript) {
- WebElement body = driver.findElement(By.tagName("body"));
- innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
+ WebElement body = driver.findElement(By.tagName("body"));
+ innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
}
else
- innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+ innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
return innerHtml;
} catch (Exception e) {
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
- throw new RuntimeException(e);
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ throw new RuntimeException(e);
} finally {
- cleanUpDriver(driver);
+ cleanUpDriver(driver);
}
}
@@ -167,8 +167,8 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
LOG.debug("In-memory screenshot taken of: {}", url);
FileSystem fs = FileSystem.get(conf);
- if (conf.get("htmlunit.screenshot.location") != null) {
- Path screenshotPath = new Path(conf.get("htmlunit.screenshot.location") + "/" + srcFile.getName());
+ if (conf.get("screenshot.location") != null) {
+ Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName());
OutputStream os = null;
if (!fs.exists(screenshotPath)) {
LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
@@ -179,7 +179,7 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName());
} else {
LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
- + "'htmlunit.screenshot.location' is absent from nutch-site.xml.", url);
+ + "'screenshot.location' is absent from nutch-site.xml.", url);
}
} catch (Exception e) {
cleanUpDriver(driver);
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
index baa8774..c2b88a6 100644
--- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.nutch.protocol.htmlunit;
import com.gargoylesoftware.htmlunit.WebWindowEvent;
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 583b840..3a20cfe 100644
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -68,7 +68,7 @@ public class HttpWebClient {
public static WebDriver getDriverForPage(String url, Configuration conf) {
WebDriver driver = null;
DesiredCapabilities capabilities = null;
- long pageLoadWait = conf.getLong("libselenium.page.load.delay", 3);
+ long pageLoadWait = conf.getLong("page.load.delay", 3);
try {
String driverType = conf.get("selenium.driver", "firefox");
@@ -129,11 +129,11 @@ public class HttpWebClient {
driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
driver.get(url);
} catch (Exception e) {
- if(e instanceof TimeoutException) {
- LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
- return driver;
- }
- cleanUpDriver(driver);
+ if(e instanceof TimeoutException) {
+ LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+ return driver;
+ }
+ cleanUpDriver(driver);
throw new RuntimeException(e);
}
@@ -141,7 +141,7 @@ public class HttpWebClient {
}
public static String getHTMLContent(WebDriver driver, Configuration conf) {
- if (conf.getBoolean("selenium.take.screenshot", false)) {
+ if (conf.getBoolean("take.screenshot", false)) {
takeScreenshot(driver, conf);
}
@@ -149,15 +149,15 @@ public class HttpWebClient {
}
public static void cleanUpDriver(WebDriver driver) {
- if (driver != null) {
- try {
+ if (driver != null) {
+ try {
driver.close();
- driver.quit();
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
+ driver.quit();
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
}
+ }
}
/**
@@ -175,7 +175,7 @@ public class HttpWebClient {
WebDriver driver = getDriverForPage(url, conf);
try {
- if (conf.getBoolean("selenium.take.screenshot", false)) {
+ if (conf.getBoolean("take.screenshot", false)) {
takeScreenshot(driver, conf);
}
@@ -201,8 +201,8 @@ public class HttpWebClient {
File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
LOG.debug("In-memory screenshot taken of: {}", url);
FileSystem fs = FileSystem.get(conf);
- Path screenshotPath = new Path(conf.get("selenium.screenshot.location") + "/" + srcFile.getName());
- if (screenshotPath != null) {
+ if (conf.get("screenshot.location") != null) {
+ Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName());
OutputStream os = null;
if (!fs.exists(screenshotPath)) {
LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
@@ -213,9 +213,10 @@ public class HttpWebClient {
LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName());
} else {
LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
- + "'selenium.screenshot.location' is absent from nutch-site.xml.", url);
+ + "'screenshot.location' is absent from nutch-site.xml.", url);
}
} catch (Exception e) {
+ cleanUpDriver(driver);
throw new RuntimeException(e);
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml
index 0ed0228..bf695fe 100644
--- a/src/plugin/protocol-htmlunit/build.xml
+++ b/src/plugin/protocol-htmlunit/build.xml
@@ -43,11 +43,4 @@
</copy>
</target>
- <!-- for junit test -->
- <!--
- <mkdir dir="${build.test}/data" />
- <copy todir="${build.test}/data">
- <fileset dir="jsp"/>
- </copy>-->
-
</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
index 83b7687..c40ed69 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -30,10 +30,6 @@ import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-/**
- *
- *
- */
public class Http extends HttpBase {
public static final Logger LOG = LoggerFactory.getLogger(Http.class);
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index a2f3b1e..7242f40 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -38,10 +38,6 @@ import org.apache.nutch.protocol.http.api.HttpException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-/**
- * An HTTP response.
- *
- */
public class HttpResponse implements Response {
private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class);
@@ -61,7 +57,7 @@ public class HttpResponse implements Response {
this.http = http;
this.url = url;
- LOG.info("fetching " + url);
+ LOG.info("fetching {}", url);
String path = "".equals(url.getFile()) ? "/" : url.getFile();
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5293599/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
index 34d1d1c..4181951 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
<html>
<body>
<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p>
[3/5] nutch git commit: fix for NUTCH-2191 contributed by karanjeets
Posted by ma...@apache.org.
fix for NUTCH-2191 contributed by karanjeets
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/3cda2229
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/3cda2229
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/3cda2229
Branch: refs/heads/master
Commit: 3cda222971c970270dcc7525b97dfffe4b818ced
Parents: 366104d
Author: Karanjeet Singh <co...@gmail.com>
Authored: Mon Mar 28 22:58:40 2016 -0700
Committer: Karanjeet Singh <co...@gmail.com>
Committed: Mon Mar 28 22:58:40 2016 -0700
----------------------------------------------------------------------
default.properties | 1 +
.../protocol/htmlunit/HtmlUnitWebDriver.java | 125 +++++++++----------
.../htmlunit/HtmlUnitWebWindowListener.java | 53 ++++----
.../nutch/protocol/htmlunit/HttpResponse.java | 5 +-
4 files changed, 93 insertions(+), 91 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/default.properties
----------------------------------------------------------------------
diff --git a/default.properties b/default.properties
index d34f778..aec5d51 100644
--- a/default.properties
+++ b/default.properties
@@ -90,6 +90,7 @@ plugins.protocol=\
org.apache.nutch.protocol.http*:\
org.apache.nutch.protocol.httpclient*:\
org.apache.nutch.protocol.selenium*
+ org.apache.nutch.protocol.htmlunit*
#
# URL Filter Plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
index fc231c3..5e2c0ac 100644
--- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java
@@ -51,79 +51,79 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
private static int maxRedirects;
public HtmlUnitWebDriver() {
- super(enableJavascript);
+ super(enableJavascript);
}
@Override
protected WebClient modifyWebClient(WebClient client) {
- client.getOptions().setJavaScriptEnabled(enableJavascript);
- client.getOptions().setCssEnabled(enableCss);
- client.getOptions().setRedirectEnabled(enableRedirect);
- if(enableJavascript)
- client.setJavaScriptTimeout(javascriptTimeout);
- client.getOptions().setThrowExceptionOnScriptError(false);
- if(enableRedirect)
- client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
- return client;
+ client.getOptions().setJavaScriptEnabled(enableJavascript);
+ client.getOptions().setCssEnabled(enableCss);
+ client.getOptions().setRedirectEnabled(enableRedirect);
+ if(enableJavascript)
+ client.setJavaScriptTimeout(javascriptTimeout);
+ client.getOptions().setThrowExceptionOnScriptError(false);
+ if(enableRedirect)
+ client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects));
+ return client;
}
public static WebDriver getDriverForPage(String url, Configuration conf) {
- long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
- enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
- enableCss = conf.getBoolean("htmlunit.enable.css", false);
- javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
- int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
- enableRedirect = redirects <= 0 ? false : true;
- maxRedirects = redirects;
+ long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3);
+ enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
+ enableCss = conf.getBoolean("htmlunit.enable.css", false);
+ javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);
+ int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
+ enableRedirect = redirects <= 0 ? false : true;
+ maxRedirects = redirects;
- WebDriver driver = null;
+ WebDriver driver = null;
- try {
- driver = new HtmlUnitWebDriver();
- driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
- driver.get(url);
- } catch(Exception e) {
- if(e instanceof TimeoutException) {
- LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
- return driver;
- }
- cleanUpDriver(driver);
- throw new RuntimeException(e);
- }
+ try {
+ driver = new HtmlUnitWebDriver();
+ driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS);
+ driver.get(url);
+ } catch(Exception e) {
+ if(e instanceof TimeoutException) {
+ LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+ return driver;
+ }
+ cleanUpDriver(driver);
+ throw new RuntimeException(e);
+ }
- return driver;
+ return driver;
}
public static String getHTMLContent(WebDriver driver, Configuration conf) {
- try {
- if (conf.getBoolean("htmlunit.take.screenshot", false))
- takeScreenshot(driver, conf);
+ try {
+ if (conf.getBoolean("htmlunit.take.screenshot", false))
+ takeScreenshot(driver, conf);
- String innerHtml = "";
- if(enableJavascript) {
- WebElement body = driver.findElement(By.tagName("body"));
- innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
- }
- else
- innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
- return innerHtml;
- } catch(Exception e) {
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
- cleanUpDriver(driver);
- throw new RuntimeException(e);
- }
+ String innerHtml = "";
+ if(enableJavascript) {
+ WebElement body = driver.findElement(By.tagName("body"));
+ innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
+ }
+ else
+ innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+ return innerHtml;
+ } catch(Exception e) {
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ cleanUpDriver(driver);
+ throw new RuntimeException(e);
+ }
}
public static void cleanUpDriver(WebDriver driver) {
- if (driver != null) {
- try {
- driver.close();
- driver.quit();
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
+ if (driver != null) {
+ try {
+ driver.close();
+ driver.quit();
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
}
+ }
}
/**
@@ -142,23 +142,22 @@ public class HtmlUnitWebDriver extends HtmlUnitDriver {
try {
if (conf.getBoolean("htmlunit.take.screenshot", false))
- takeScreenshot(driver, conf);
+ takeScreenshot(driver, conf);
-
String innerHtml = "";
if(enableJavascript) {
- WebElement body = driver.findElement(By.tagName("body"));
- innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
+ WebElement body = driver.findElement(By.tagName("body"));
+ innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body);
}
else
- innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
+ innerHtml = driver.getPageSource().replaceAll("&amp;", "&");
return innerHtml;
} catch (Exception e) {
- TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
- throw new RuntimeException(e);
+ TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+ throw new RuntimeException(e);
} finally {
- cleanUpDriver(driver);
+ cleanUpDriver(driver);
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
index 760f4aa..baa8774 100644
--- a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
+++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java
@@ -5,32 +5,33 @@ import com.gargoylesoftware.htmlunit.WebWindowListener;
public class HtmlUnitWebWindowListener implements WebWindowListener {
- private Integer redirectCount = 0;
- private Integer maxRedirects = 0;
-
- public HtmlUnitWebWindowListener() {
-
- }
-
- public HtmlUnitWebWindowListener(int maxRedirects) {
- this.maxRedirects = maxRedirects;
- }
-
- @Override
- public void webWindowOpened(WebWindowEvent event) {
-
- }
+ private Integer redirectCount = 0;
+ private Integer maxRedirects = 0;
+
+ public HtmlUnitWebWindowListener() {
+
+ }
+
+ public HtmlUnitWebWindowListener(int maxRedirects) {
+ this.maxRedirects = maxRedirects;
+ }
+
+ @Override
+ public void webWindowOpened(WebWindowEvent event) {
+
+ }
- @Override
- public void webWindowContentChanged(WebWindowEvent event) {
- redirectCount++;
- if(redirectCount > maxRedirects)
- throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects);
- }
+ @Override
+ public void webWindowContentChanged(WebWindowEvent event) {
+ redirectCount++;
+ if(redirectCount > maxRedirects)
+ throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects);
+ }
- @Override
- public void webWindowClosed(WebWindowEvent event) {
-
- }
-
+ @Override
+ public void webWindowClosed(WebWindowEvent event) {
+
+ }
+
}
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/3cda2229/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 72b1fa1..a2f3b1e 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -344,7 +344,8 @@ public class HttpResponse implements Response {
@Override
public int getCode() {
- // TODO Auto-generated method stub
- return code;
+ // TODO Auto-generated method stub
+ return code;
}
}
+
[2/5] nutch git commit: fix for NUTCH-2191 contributed by karanjeets
Posted by ma...@apache.org.
fix for NUTCH-2191 contributed by karanjeets
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/366104d1
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/366104d1
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/366104d1
Branch: refs/heads/master
Commit: 366104d169ec566004c0c1522b8ee39bc692b86d
Parents: fa33472
Author: Karanjeet Singh <co...@gmail.com>
Authored: Sat Mar 26 23:26:40 2016 -0700
Committer: Karanjeet Singh <co...@gmail.com>
Committed: Sat Mar 26 23:26:40 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 2 ++
1 file changed, 2 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/366104d1/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index b15b78c..1f48eb6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ in the release announcement and keep it on top in this CHANGES.txt for the Nutch
Nutch Change Log
+* NUTCH-2191 Add HtmlUnit plugin in Nutch. (karanjeets and markus17 via mattmann)
+
* NUTCH-2241 Unstable Selenium plugin in Nutch. Fixed bugs and enhanced configuration (karanjeets via mattmann)
* NUTCH-2213 CommonCrawlDataDumper saves gzipped body in extracted form (jnioche via mattmann)