You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/02/26 19:31:39 UTC

svn commit: r1662530 - in /nutch/trunk: ./ ivy/ src/plugin/ src/plugin/lib-selenium/ src/plugin/lib-selenium/src/ src/plugin/lib-selenium/src/java/ src/plugin/lib-selenium/src/java/org/ src/plugin/lib-selenium/src/java/org/apache/ src/plugin/lib-seleni...

Author: lewismc
Date: Thu Feb 26 18:31:39 2015
New Revision: 1662530

URL: http://svn.apache.org/r1662530
Log:
NUTCH-1933 nutch-selenium plugin

Added:
    nutch/trunk/src/plugin/lib-selenium/
    nutch/trunk/src/plugin/lib-selenium/build.xml
    nutch/trunk/src/plugin/lib-selenium/ivy.xml
    nutch/trunk/src/plugin/lib-selenium/plugin.xml
    nutch/trunk/src/plugin/lib-selenium/src/
    nutch/trunk/src/plugin/lib-selenium/src/java/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/
    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
    nutch/trunk/src/plugin/protocol-selenium/
    nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
    nutch/trunk/src/plugin/protocol-selenium/build.xml
    nutch/trunk/src/plugin/protocol-selenium/ivy.xml
    nutch/trunk/src/plugin/protocol-selenium/plugin.xml
    nutch/trunk/src/plugin/protocol-selenium/src/
    nutch/trunk/src/plugin/protocol-selenium/src/java/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
    nutch/trunk/src/plugin/protocol-selenium/src/target/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/
    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/build.xml
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 26 18:31:39 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc)
+
 * NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc)
 
 * NUTCH-1724 LinkDBReader to support regex output filtering (markus)

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Thu Feb 26 18:31:39 2015
@@ -184,6 +184,7 @@
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
+      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
@@ -197,6 +198,7 @@
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
@@ -591,6 +593,7 @@
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
       <packageset dir="${plugins.dir}/lib-http/src/java"/>
+      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
@@ -604,6 +607,7 @@
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
@@ -985,6 +989,8 @@
         <source path="${plugins.dir}/language-identifier/src/test/" />
         <source path="${plugins.dir}/lib-http/src/java/" />
         <source path="${plugins.dir}/lib-http/src/test/" />
+        <source path="${plugins.dir}/lib-selenium/src/java/" />
+        <source path="${plugins.dir}/lib-selenium/src/test/" />
         <source path="${plugins.dir}/lib-regex-filter/src/java/" />
         <source path="${plugins.dir}/lib-regex-filter/src/test/" />
         <source path="${plugins.dir}/microformats-reltag/src/java/" />
@@ -1008,6 +1014,8 @@
         <source path="${plugins.dir}/protocol-httpclient/src/test/" />
         <source path="${plugins.dir}/protocol-http/src/java/" />
         <source path="${plugins.dir}/protocol-http/src/test/" />
+        <source path="${plugins.dir}/protocol-selenium/src/java"/>
+        <source path="${plugins.dir}/protocol-selenium/src/test"/>
         <source path="${plugins.dir}/scoring-depth/src/java/" />
         <source path="${plugins.dir}/scoring-link/src/java/" />
         <source path="${plugins.dir}/scoring-opic/src/java/" />

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Thu Feb 26 18:31:39 2015
@@ -23,24 +23,24 @@
 			database etc.
 		</description>
 	</info>
-
+	
 	<configurations>
 		<include file="${basedir}/ivy/ivy-configurations.xml" />
 	</configurations>
-
+	
 	<publications>
 		<!--get the artifact from our module name -->
 		<artifact conf="master" />
 	</publications>
-
+	
 	<dependencies>
 		<dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
 			conf="*->master" />
 		<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
 			conf="*->master" />
-
+		
 		<dependency org="log4j" name="log4j" rev="1.2.15" conf="*->master" />
-
+		
 		<dependency org="commons-lang" name="commons-lang" rev="2.6"
 			conf="*->default" />
 		<dependency org="commons-collections" name="commons-collections"
@@ -49,7 +49,7 @@
 			rev="3.1" conf="*->master" />
 		<dependency org="commons-codec" name="commons-codec" rev="1.3"
 			conf="*->default" />
-
+		
 		<dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0"
 			conf="*->default">
 			<exclude org="hsqldb" name="hsqldb" />

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Feb 26 18:31:39 2015
@@ -50,6 +50,8 @@
      <ant dir="protocol-ftp" target="deploy"/>
      <ant dir="protocol-http" target="deploy"/>
      <ant dir="protocol-httpclient" target="deploy"/>
+     <ant dir="lib-selenium" target="deploy"/>
+     <ant dir="protocol-selenium" target="deploy" />
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
@@ -149,6 +151,8 @@
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
     <ant dir="protocol-httpclient" target="clean"/>
+    <ant dir="lib-selenium" target="clean"/>
+    <ant dir="protocol-selenium" target="clean" />
     <ant dir="parse-ext" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-html" target="clean"/>

Added: nutch/trunk/src/plugin/lib-selenium/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/build.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/build.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">    
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+  </path>
+</project>

Added: nutch/trunk/src/plugin/lib-selenium/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/lib-selenium/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! Selenium WebDriver support library for Selenium-based protocol plugins
+ !-->
+<plugin
+   id="lib-selenium"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-selenium.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+   <!--
+    ! NOTE(review): protocol-selenium/plugin.xml lists its jars under <runtime>
+    ! and keeps only <import> entries under <requires>; confirm the plugin
+    ! loader honours <library> and <exclude> elements in this position.
+    !-->
+   <requires>
+     <!-- jar name matches the selenium-java rev (2.44.0) declared in ivy.xml -->
+     <library name="selenium-java-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="operadriver-1.5.jar">
+       <export name="*"/>
+       <exclude name="selenium-remote-driver" />
+     </library>
+   </requires>
+</plugin>

Added: nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (added)
+++ nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.WebElement;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.support.ui.ExpectedConditions;
+import org.openqa.selenium.support.ui.WebDriverWait;
+
+import java.lang.String;
+
+/**
+ * Loads pages in a Firefox browser driven by Selenium WebDriver and returns the
+ * inner HTML of the loaded page's body element.
+ */
+public class HttpWebClient {
+
+  private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");
+
+  /** Maximum time, in seconds, to wait for the body element after navigation. */
+  private static final long BODY_WAIT_SECONDS = 3;
+
+  /**
+   * Per-thread Firefox driver whose profile sets the stylesheet and image
+   * permission preferences to 2 and switches off the libflashplayer plugin
+   * preference. {@link #getHtmlPage(String, Configuration)} does not use it; it
+   * starts and quits its own browser on every call.
+   */
+  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
+
+    @Override
+    protected WebDriver initialValue() {
+      FirefoxProfile profile = new FirefoxProfile();
+      profile.setPreference("permissions.default.stylesheet", 2);
+      profile.setPreference("permissions.default.image", 2);
+      // Pass a real boolean; the String "false" would be written to the profile
+      // as a quoted string value instead of turning the flag off.
+      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", false);
+      return new FirefoxDriver(profile);
+    }
+  };
+
+  /**
+   * Opens {@code url} in a new Firefox instance, waits for the body element and
+   * returns its inner HTML. The browser is quit before this method returns.
+   *
+   * @param url address of the page to load
+   * @param conf configuration; not read by this method, may be {@code null}
+   * @return the {@code innerHTML} attribute of the page's body element
+   * @throws RuntimeException wrapping any exception raised while starting the
+   *         browser, loading the page or waiting for the body element
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = null;
+
+    try {
+      driver = new FirefoxDriver();
+      driver.get(url);
+
+      // Constructing a WebDriverWait does not block by itself; until() performs
+      // the wait and fails with a timeout if no body element shows up in time.
+      WebElement body = new WebDriverWait(driver, BODY_WAIT_SECONDS)
+          .until(ExpectedConditions.presenceOfElementLocated(By.tagName("body")));
+
+      return body.getAttribute("innerHTML");
+    } catch (Exception e) {
+      // Every failure reaches the caller as an unchecked exception with its cause.
+      throw new RuntimeException(e);
+    } finally {
+      if (driver != null) {
+        try {
+          driver.quit();
+        } catch (Exception e) {
+          // Only log: throwing from finally would replace the page result or
+          // hide the exception that is already propagating.
+          LOG.warn("Failed to quit Firefox driver for " + url, e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Same as {@link #getHtmlPage(String, Configuration)} with a {@code null}
+   * configuration.
+   */
+  public static String getHtmlPage(String url) {
+    return getHtmlPage(url, null);
+  }
+}
\ No newline at end of file

Added: nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

Added: nutch/trunk/src/plugin/protocol-selenium/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/build.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/build.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-selenium/*.jar" />
+    </fileset>
+  </path>
+
+</project>

Added: nutch/trunk/src/plugin/protocol-selenium/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="default"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/protocol-selenium/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/plugin.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/plugin.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-selenium"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-selenium.jar">
+         <export name="*"/>
+      </library>
+      <library name="cglib-nodep-2.1_3.jar"/>
+      <library name="commons-codec-1.9.jar"/>
+      <library name="commons-collections-3.2.1.jar"/>
+      <library name="commons-exec-1.1.jar"/>
+      <library name="commons-io-2.4.jar"/>
+      <library name="commons-jxpath-1.3.jar"/>
+      <library name="commons-lang3-3.3.2.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="cssparser-0.9.14.jar"/>
+      <library name="gson-2.3.jar"/>
+      <library name="guava-18.0.jar"/>
+      <library name="htmlunit-2.15.jar"/>
+      <library name="htmlunit-core-js-2.15.jar"/>
+      <library name="httpclient-4.3.4.jar"/>
+      <library name="httpcore-4.3.2.jar"/>
+      <library name="httpmime-4.3.3.jar"/>
+      <library name="ini4j-0.5.2.jar"/>
+      <library name="jetty-http-8.1.15.v20140411.jar"/>
+      <library name="jetty-io-8.1.15.v20140411.jar"/>
+      <library name="jetty-util-8.1.15.v20140411.jar"/>
+      <library name="jetty-websocket-8.1.15.v20140411.jar"/>
+      <library name="jna-3.4.0.jar"/>
+      <library name="nekohtml-1.9.21.jar"/>
+      <library name="netty-3.5.2.Final.jar"/>
+      <library name="operadriver-1.5.jar"/>
+      <library name="operalaunchers-1.1.jar"/>
+      <library name="platform-3.4.0.jar"/>
+      <library name="protobuf-java-2.4.1.jar"/>
+      <library name="sac-1.3.jar"/>
+      <library name="selenium-api-2.44.0.jar"/>
+      <library name="selenium-chrome-driver-2.44.0.jar"/>
+      <library name="selenium-firefox-driver-2.44.0.jar"/>
+      <library name="selenium-htmlunit-driver-2.44.0.jar"/>
+      <library name="selenium-ie-driver-2.44.0.jar"/>
+      <library name="selenium-java-2.44.0.jar"/>
+      <library name="selenium-remote-driver-2.44.0.jar"/>
+      <library name="selenium-safari-driver-2.44.0.jar"/>
+      <library name="selenium-support-2.44.0.jar"/>
+      <library name="serializer-2.7.1.jar"/>
+      <library name="webbit-0.4.14.jar"/>
+      <library name="xalan-2.7.1.jar"/>
+      <library name="xercesImpl-2.11.0.jar"/>
+      <library name="xml-apis-1.4.01.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+      <import plugin="lib-selenium"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.selenium"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.selenium.Http"
+                      class="org.apache.nutch.protocol.selenium.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.selenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * HTTP protocol implementation whose {@code getResponse} builds this package's
+ * {@link HttpResponse}.
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /** Creates the protocol and passes {@link #LOG} to the {@link HttpBase} constructor. */
+  public Http() {
+    super(LOG);
+  }
+
+  /** Forwards {@code conf} to {@link HttpBase#setConf(Configuration)} unchanged. */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Command-line entry point: configures a new instance from
+   * {@link NutchConfiguration#create()} and passes it, together with
+   * {@code args}, to the inherited two-argument {@code main}.
+   */
+  public static void main(String[] args) throws Exception {
+    Http protocol = new Http();
+    Configuration nutchConf = NutchConfiguration.create();
+    protocol.setConf(nutchConf);
+    main(protocol, args);
+  }
+
+  /**
+   * Builds the response for {@code url} by constructing an {@link HttpResponse}.
+   * The {@code redirect} flag is not used.
+   */
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    Response response = new HttpResponse(this, url, datum);
+    return response;
+  }
+
+}

Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+
+/* Most of this code was borrowed from protocol-htmlunit, which in turn borrowed it from protocol-httpclient. */
+
+/**
+ * An HTTP response obtained by sending a raw HTTP/1.0 GET over a plain socket.
+ *
+ * <p>The status line and headers are always parsed from the socket. When the
+ * Content-Type header contains {@code text/html} or {@code application/xhtml},
+ * the body is not read from the socket; instead the page is fetched again
+ * through {@link HttpWebClient#getHtmlPage(String, Configuration)} and its
+ * UTF-8 bytes become the content. For any other Content-Type the body is read
+ * from the socket, limited by Content-Length and {@code http.getMaxContent()}.
+ * When no Content-Type header is present the content stays {@code null}.</p>
+ *
+ * <p>NOTE(review): the request advertises gzip/deflate in Accept-Encoding, but
+ * the raw branch stores the body bytes without decoding them in this class --
+ * confirm whether callers handle Content-Encoding.</p>
+ */
+public class HttpResponse implements Response {
+
+  private Http http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  /**
+   * Connects to the server (or the configured proxy), sends the GET request
+   * and reads the response.
+   *
+   * @param http  protocol instance supplying timeout, proxy, user agent,
+   *              accept headers and the maximum content size
+   * @param url   the URL to fetch; its protocol must be {@code http}
+   * @param datum crawl record; a positive modified time adds an
+   *              If-Modified-Since header
+   * @throws ProtocolException (as {@link HttpException}) if the URL is not an
+   *         http URL or the status line or a header is malformed
+   * @throws IOException on socket errors, or when reading the raw body of a
+   *         200 response fails
+   */
+  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    if (!"http".equals(url.getProtocol())) {
+      throw new HttpException("Not an HTTP url:" + url);
+    }
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy() ? http.getProxyHost() : host;
+      int sockPort = http.useProxy() ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuilder reqStr = new StringBuilder("GET ");
+      if (http.useProxy()) {
+        // a proxy needs the absolute URL in the request line
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      // Use an explicit charset so the bytes on the wire do not depend on the
+      // platform default encoding.
+      byte[] reqBytes = reqStr.toString().getBytes("UTF-8");
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+              Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with Selenium only if content type in HTML or XHTML
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          readPlainContent(url);
+        } else {
+          try {
+            int contentLength = Integer.MAX_VALUE;
+            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+            if (contentLengthString != null) {
+              try {
+                contentLength = Integer.parseInt(contentLengthString.trim());
+              } catch (NumberFormatException ex) {
+                throw new HttpException("bad content length: " + contentLengthString, ex);
+              }
+            }
+
+            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+              contentLength = http.getMaxContent();
+            }
+
+            // Read at most contentLength bytes. Each read is capped at the
+            // number of bytes still allowed, so the final chunk is kept
+            // (truncated to the limit) instead of being dropped entirely.
+            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+            int totalRead = 0;
+            ByteArrayOutputStream out = new ByteArrayOutputStream();
+            while (totalRead < contentLength) {
+              int wanted = Math.min(buffer.length, contentLength - totalRead);
+              int bufferFilled = in.read(buffer, 0, wanted);
+              if (bufferFilled == -1) {
+                break;
+              }
+              out.write(buffer, 0, bufferFilled);
+              totalRead += bufferFilled;
+            }
+
+            content = out.toByteArray();
+
+          } catch (Exception e) {
+            if (code == 200) {
+              // keep the original exception as the cause for diagnosis
+              throw new IOException(e.toString(), e);
+            }
+            // for codes other than 200 OK, we are fine with empty content
+          } finally {
+            in.close();
+          }
+        }
+      }
+
+    } finally {
+      if (socket != null) {
+        socket.close();
+      }
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  /** Returns the URL this response was fetched from. */
+  public URL getUrl() {
+    return url;
+  }
+
+  /** Returns the numeric status code parsed from the status line. */
+  public int getCode() {
+    return code;
+  }
+
+  /** Returns the value stored for the header {@code name}, as reported by the header metadata. */
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  /** Returns the parsed response headers. */
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  /** Returns the response body; {@code null} if no body was read. */
+  public byte[] getContent() {
+    return content;
+  }
+
+  /* -------------------------- *
+   * </implementation:Response> *
+   * -------------------------- */
+
+  /**
+   * Fetches the page through {@link HttpWebClient} and stores the returned
+   * markup, encoded as UTF-8, as the content.
+   */
+  private void readPlainContent(URL url) throws IOException {
+    String page = HttpWebClient.getHtmlPage(url.toString(), conf);
+
+    content = page.getBytes("UTF-8");
+  }
+
+  /**
+   * Reads the status line and extracts the numeric status code, i.e. the
+   * token following the first space.
+   *
+   * @throws HttpException if that token is not an integer
+   */
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1) {
+      codeEnd = line.length();
+    }
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  /**
+   * Splits one header line at the first colon and stores the key/value pair.
+   * Lines that contain only whitespace are ignored.
+   *
+   * @throws HttpException if a non-blank line contains no colon
+   */
+  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++) {
+        if (!Character.isWhitespace(line.charAt(i))) {
+          break;
+        }
+      }
+      if (i == line.length()) {
+        return;
+      }
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t') {
+        break;
+      }
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  /**
+   * Reads header lines until a blank line and adds them to the headers
+   * Metadata. If markup ({@code <!DOCTYPE}, {@code <HTML} or {@code <html})
+   * shows up inside a header line, the blank separator line is assumed to be
+   * missing: the markup is pushed back onto the stream and header parsing
+   * stops.
+   */
+  private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int pos;
+      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
+          || ((pos = line.indexOf("<html")) != -1)) {
+
+        // readLine() turns each byte into one char (0..255), so ISO-8859-1
+        // restores exactly the bytes that were read; UTF-8 would expand
+        // bytes >= 0x80 into two bytes and corrupt the pushed-back data.
+        in.unread(line.substring(pos).getBytes("ISO-8859-1"));
+        line.setLength(pos);
+
+        try {
+          //TODO: (CM) We don't know the header names here
+          //since we're just handling them generically. It would
+          //be nice to provide some sort of mapping function here
+          //for the returned header names to the standard metadata
+          //names in the ParseData class
+          processHeaderLine(line);
+        } catch (Exception e) {
+          // fixme:
+          Http.LOG.warn("Error: ", e);
+        }
+        return;
+      }
+
+      processHeaderLine(line);
+    }
+  }
+
+  /**
+   * Reads one line into {@code line}, accepting CR, LF or CRLF terminators.
+   *
+   * @param allowContinuedLine if true, a non-blank line followed by a line
+   *        starting with a space or tab is treated as continued
+   * @return the length of the line read (0 for a blank line)
+   * @throws EOFException if the stream ends before a line terminator
+   */
+  private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
+      throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        if (peek(in) == '\n') {
+          in.read();
+        }
+        // intentional fall-through: CR and CRLF end the line just like LF
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte without consuming it, or -1 at end of stream.
+   * Nothing is pushed back at end of stream: unread(-1) would insert a 0xFF
+   * byte that later reads would return as data.
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+}

Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html Thu Feb 26 18:31:39 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p>
+</body>
+</html>

Added: nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html Thu Feb 26 18:31:39 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via HtmlUnit.</p><p></p>
+</body>
+</html>



Unsubscribe

Posted by Gioele Zanzico <gi...@manfrotto.com>.
Unsubscribe

Sent from my iPhone

On 26 Feb 2015, at 20:13, "Massimo Miccoli" <mm...@iltrovatore.it>> wrote:



Massimo

Il giorno 26/feb/2015, alle ore 19:31, lewismc@apache.org<ma...@apache.org> ha scritto:

Author: lewismc
Date: Thu Feb 26 18:31:39 2015
New Revision: 1662530

URL: http://svn.apache.org/r1662530
Log:
NUTCH-1933 nutch-selenium plugin

Added:
  nutch/trunk/src/plugin/lib-selenium/
  nutch/trunk/src/plugin/lib-selenium/build.xml
  nutch/trunk/src/plugin/lib-selenium/ivy.xml
  nutch/trunk/src/plugin/lib-selenium/plugin.xml
  nutch/trunk/src/plugin/lib-selenium/src/
  nutch/trunk/src/plugin/lib-selenium/src/java/
  nutch/trunk/src/plugin/lib-selenium/src/java/org/
  nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/
  nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/
  nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/
  nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/
  nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
  nutch/trunk/src/plugin/protocol-selenium/
  nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
  nutch/trunk/src/plugin/protocol-selenium/build.xml
  nutch/trunk/src/plugin/protocol-selenium/ivy.xml
  nutch/trunk/src/plugin/protocol-selenium/plugin.xml
  nutch/trunk/src/plugin/protocol-selenium/src/
  nutch/trunk/src/plugin/protocol-selenium/src/java/
  nutch/trunk/src/plugin/protocol-selenium/src/java/org/
  nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/
  nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/
  nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/
  nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/
  nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
  nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
  nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
  nutch/trunk/src/plugin/protocol-selenium/src/target/
  nutch/trunk/src/plugin/protocol-selenium/src/target/classes/
  nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/
  nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/
  nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/
  nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/
  nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/
  nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
Modified:
  nutch/trunk/CHANGES.txt
  nutch/trunk/build.xml
  nutch/trunk/ivy/ivy.xml
  nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 26 18:31:39 2015
@@ -2,6 +2,8 @@ Nutch Change Log

Nutch Current Development 1.10-SNAPSHOT

+* NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc)
+
* NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn<http://yuanyun.cn>, snagel, lewismc)

* NUTCH-1724 LinkDBReader to support regex output filtering (markus)

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Thu Feb 26 18:31:39 2015
@@ -184,6 +184,7 @@
     <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
     <packageset dir="${plugins.dir}/language-identifier/src/java"/>
     <packageset dir="${plugins.dir}/lib-http/src/java"/>
+      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
     <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
     <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
     <packageset dir="${plugins.dir}/parse-ext/src/java"/>
@@ -197,6 +198,7 @@
     <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
     <packageset dir="${plugins.dir}/protocol-http/src/java"/>
     <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
     <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
     <packageset dir="${plugins.dir}/scoring-link/src/java"/>
     <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
@@ -591,6 +593,7 @@
     <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
     <packageset dir="${plugins.dir}/language-identifier/src/java"/>
     <packageset dir="${plugins.dir}/lib-http/src/java"/>
+      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
     <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
     <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
     <packageset dir="${plugins.dir}/parse-ext/src/java"/>
@@ -604,6 +607,7 @@
     <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
     <packageset dir="${plugins.dir}/protocol-http/src/java"/>
     <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
     <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
     <packageset dir="${plugins.dir}/scoring-link/src/java"/>
     <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
@@ -985,6 +989,8 @@
       <source path="${plugins.dir}/language-identifier/src/test/" />
       <source path="${plugins.dir}/lib-http/src/java/" />
       <source path="${plugins.dir}/lib-http/src/test/" />
+        <source path="${plugins.dir}/lib-selenium/src/java/" />
+        <source path="${plugins.dir}/lib-selenium/src/test/" />
       <source path="${plugins.dir}/lib-regex-filter/src/java/" />
       <source path="${plugins.dir}/lib-regex-filter/src/test/" />
       <source path="${plugins.dir}/microformats-reltag/src/java/" />
@@ -1008,6 +1014,8 @@
       <source path="${plugins.dir}/protocol-httpclient/src/test/" />
       <source path="${plugins.dir}/protocol-http/src/java/" />
       <source path="${plugins.dir}/protocol-http/src/test/" />
+        <source path="${plugins.dir}/protocol-selenium/src/java"/>
+        <source path="${plugins.dir}/protocol-selenium/src/test"/>
       <source path="${plugins.dir}/scoring-depth/src/java/" />
       <source path="${plugins.dir}/scoring-link/src/java/" />
       <source path="${plugins.dir}/scoring-opic/src/java/" />

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Thu Feb 26 18:31:39 2015
@@ -23,24 +23,24 @@
          database etc.
      </description>
  </info>
-
+
  <configurations>
      <include file="${basedir}/ivy/ivy-configurations.xml" />
  </configurations>
-
+
  <publications>
      <!--get the artifact from our module name -->
      <artifact conf="master" />
  </publications>
-
+
  <dependencies>
      <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
          conf="*->master" />
      <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
          conf="*->master" />
-
+
      <dependency org="log4j" name="log4j" rev="1.2.15" conf="*->master" />
-
+
      <dependency org="commons-lang" name="commons-lang" rev="2.6"
          conf="*->default" />
      <dependency org="commons-collections" name="commons-collections"
@@ -49,7 +49,7 @@
          rev="3.1" conf="*->master" />
      <dependency org="commons-codec" name="commons-codec" rev="1.3"
          conf="*->default" />
-
+
      <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0"
          conf="*->default">
          <exclude org="hsqldb" name="hsqldb" />

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Feb 26 18:31:39 2015
@@ -50,6 +50,8 @@
    <ant dir="protocol-ftp" target="deploy"/>
    <ant dir="protocol-http" target="deploy"/>
    <ant dir="protocol-httpclient" target="deploy"/>
+     <ant dir="lib-selenium" target="deploy"/>
+     <ant dir="protocol-selenium" target="deploy" />
    <ant dir="parse-ext" target="deploy"/>
    <ant dir="parse-js" target="deploy"/>
    <ant dir="parse-html" target="deploy"/>
@@ -149,6 +151,8 @@
   <ant dir="protocol-ftp" target="clean"/>
   <ant dir="protocol-http" target="clean"/>
   <ant dir="protocol-httpclient" target="clean"/>
+    <ant dir="lib-selenium" target="clean"/>
+    <ant dir="protocol-selenium" target="clean" />
   <ant dir="parse-ext" target="clean"/>
   <ant dir="parse-js" target="clean"/>
   <ant dir="parse-html" target="clean"/>

Added: nutch/trunk/src/plugin/lib-selenium/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/build.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/build.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="lib-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+  </path>
+</project>

Added: nutch/trunk/src/plugin/lib-selenium/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+
+</ivy-module>

Added: nutch/trunk/src/plugin/lib-selenium/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!--
+ ! Library plugin bundling the Selenium WebDriver client used by protocol-selenium
+ !-->
+<plugin
+   id="lib-selenium"
+   name="HTTP Framework"
+   version="1.0"
+   provider-name="org.apache.nutch">
+
+   <runtime>
+     <library name="lib-selenium.jar">
+        <export name="*"/>
+     </library>
+   </runtime>
+
+   <requires>
+     <!-- Jar versions must match the revisions declared in this plugin's ivy.xml
+          (selenium-java 2.44.0, operadriver 1.5). -->
+     <library name="selenium-java-2.44.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="operadriver-1.5.jar">
+       <export name="*"/>
+       <exclude name="selenium-remote-driver" />
+     </library>
+   </requires>
+</plugin>

Added: nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (added)
+++ nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.support.ui.WebDriverWait;
+
+import java.lang.String;
+
+/**
+ * Static helper that loads a page in Firefox through Selenium WebDriver and
+ * returns the markup of its {@code <body>} element.
+ */
+public class HttpWebClient {
+
+  private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");
+
+  /**
+   * Per-thread Firefox driver whose profile blocks stylesheets and images and
+   * sets the Flash plugin preference to "false". It is created lazily on the
+   * first {@code get()} from a thread. {@link #getHtmlPage(String, Configuration)}
+   * does not use it; that method starts its own driver for every call.
+   */
+  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
+
+    @Override
+    protected WebDriver initialValue()
+    {
+      FirefoxProfile profile = new FirefoxProfile();
+      profile.setPreference("permissions.default.stylesheet", 2);
+      profile.setPreference("permissions.default.image", 2);
+      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
+      WebDriver driver = new FirefoxDriver(profile);
+      return driver;
+    }
+  };
+
+  /**
+   * Opens {@code url} in a new Firefox instance, waits up to 3 seconds for the
+   * {@code <body>} element to be present, and returns its innerHTML. The
+   * browser is always shut down afterwards.
+   *
+   * @param url  the address to load
+   * @param conf not used by this implementation
+   * @return the innerHTML of the page's {@code <body>} element
+   * @throws RuntimeException wrapping any failure while starting the browser,
+   *         loading the page, waiting for the body or reading it
+   */
+  public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = null;
+
+    try {
+      // A fresh browser per request; the per-thread driver above is not used.
+      driver = new FirefoxDriver();
+
+      driver.get(url);
+
+      // Wait for the page to load, timeout after 3 seconds. The wait only
+      // happens when until(...) is called; constructing WebDriverWait alone
+      // does nothing.
+      By body = By.tagName("body");
+      new WebDriverWait(driver, 3).until(
+          org.openqa.selenium.support.ui.ExpectedConditions.presenceOfElementLocated(body));
+
+      String innerHtml = driver.findElement(body).getAttribute("innerHTML");
+
+      return innerHtml;
+
+      // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    } finally {
+      if (driver != null) {
+        try {
+          driver.quit();
+        } catch (Exception e) {
+          // Throwing from finally would hide the primary exception or discard
+          // a page that was fetched successfully, so only log the failure.
+          LOG.warn("Failed to quit WebDriver after fetching " + url, e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Convenience overload of {@link #getHtmlPage(String, Configuration)} that
+   * passes a {@code null} configuration.
+   */
+  public static String getHtmlPage(String url) {
+    return getHtmlPage(url, null);
+  }
+}
\ No newline at end of file

Added: nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Stand-alone Ant script: downloads Ivy if needed, loads its Ant tasks and
+     retrieves this plugin's dependencies into lib/ (default target: deps-jar). -->
+<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <!-- Ivy version to download and where to keep the jar. ivy.home is taken
+         from env.IVY_HOME when that property is set; otherwise the later
+         definition (${user.home}/.ant) applies.
+         NOTE(review): this file declares no <property environment="env"/>, so
+         env.IVY_HOME presumably has to be supplied by the caller - confirm. -->
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <!-- Fetches the Ivy jar into ${ivy.jar.dir}; skipped when the "offline"
+         property is set. -->
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <!-- Registers the Ivy Ant tasks from every jar found in ${ivy.jar.dir}. -->
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <!-- Retrieves the resolved dependencies as lib/<artifact>-<revision>.<ext>. -->
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

Added: nutch/trunk/src/plugin/protocol-selenium/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/build.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/build.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Ant build for the protocol-selenium plugin; the shared logic comes from
+     the imported ../build-plugin.xml and the default target is jar-core. -->
+<project name="protocol-selenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <!-- Runs the "jar" target of the sibling lib-http and lib-selenium plugins,
+       without passing this project's properties on (inheritall="false"). -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <!-- Picks up the lib-http and lib-selenium jars from anywhere below
+       ${nutch.root}/build. -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-selenium/*.jar" />
+    </fileset>
+  </path>
+
+</project>

Added: nutch/trunk/src/plugin/protocol-selenium/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="default"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+
+</ivy-module>

Added: nutch/trunk/src/plugin/protocol-selenium/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/plugin.xml?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/protocol-selenium/plugin.xml Thu Feb 26 18:31:39 2015
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-selenium"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-selenium.jar">
+         <export name="*"/>
+      </library>
+      <library name="cglib-nodep-2.1_3.jar"/>
+      <library name="commons-codec-1.9.jar"/>
+      <library name="commons-collections-3.2.1.jar"/>
+      <library name="commons-exec-1.1.jar"/>
+      <library name="commons-io-2.4.jar"/>
+      <library name="commons-jxpath-1.3.jar"/>
+      <library name="commons-lang3-3.3.2.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="cssparser-0.9.14.jar"/>
+      <library name="gson-2.3.jar"/>
+      <library name="guava-18.0.jar"/>
+      <library name="htmlunit-2.15.jar"/>
+      <library name="htmlunit-core-js-2.15.jar"/>
+      <library name="httpclient-4.3.4.jar"/>
+      <library name="httpcore-4.3.2.jar"/>
+      <library name="httpmime-4.3.3.jar"/>
+      <library name="ini4j-0.5.2.jar"/>
+      <library name="jetty-http-8.1.15.v20140411.jar"/>
+      <library name="jetty-io-8.1.15.v20140411.jar"/>
+      <library name="jetty-util-8.1.15.v20140411.jar"/>
+      <library name="jetty-websocket-8.1.15.v20140411.jar"/>
+      <library name="jna-3.4.0.jar"/>
+      <library name="nekohtml-1.9.21.jar"/>
+      <library name="netty-3.5.2.Final.jar"/>
+      <library name="operadriver-1.5.jar"/>
+      <library name="operalaunchers-1.1.jar"/>
+      <library name="platform-3.4.0.jar"/>
+      <library name="protobuf-java-2.4.1.jar"/>
+      <library name="sac-1.3.jar"/>
+      <library name="selenium-api-2.44.0.jar"/>
+      <library name="selenium-chrome-driver-2.44.0.jar"/>
+      <library name="selenium-firefox-driver-2.44.0.jar"/>
+      <library name="selenium-htmlunit-driver-2.44.0.jar"/>
+      <library name="selenium-ie-driver-2.44.0.jar"/>
+      <library name="selenium-java-2.44.0.jar"/>
+      <library name="selenium-remote-driver-2.44.0.jar"/>
+      <library name="selenium-safari-driver-2.44.0.jar"/>
+      <library name="selenium-support-2.44.0.jar"/>
+      <library name="serializer-2.7.1.jar"/>
+      <library name="webbit-0.4.14.jar"/>
+      <library name="xalan-2.7.1.jar"/>
+      <library name="xercesImpl-2.11.0.jar"/>
+      <library name="xml-apis-1.4.01.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+      <import plugin="lib-selenium"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.selenium"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.selenium.Http"
+                      class="org.apache.nutch.protocol.selenium.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.selenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Protocol implementation for the {@code protocol-selenium} plugin. It builds
+ * on {@link HttpBase} and produces {@link HttpResponse} objects for each
+ * requested URL.
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /** Creates the protocol, handing this plugin's logger to {@link HttpBase}. */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Command-line entry point: configures a new instance with
+   * {@link NutchConfiguration#create()} and hands it, together with the
+   * arguments, to the inherited two-argument {@code main}.
+   *
+   * @param args command-line arguments, passed through unchanged
+   * @throws Exception whatever the inherited {@code main} raises
+   */
+  public static void main(String[] args) throws Exception {
+    Http protocol = new Http();
+    protocol.setConf(NutchConfiguration.create());
+    main(protocol, args);
+  }
+
+  /**
+   * Applies the configuration by delegating to the superclass.
+   *
+   * @param conf configuration to apply
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Builds the response for {@code url} by constructing a new
+   * {@link HttpResponse}; the {@code redirect} flag is not consulted.
+   *
+   * @param url      address to fetch
+   * @param datum    crawl record passed on to the response
+   * @param redirect ignored by this implementation
+   * @return the freshly fetched response
+   */
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}

Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java Thu Feb 26 18:31:39 2015
@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.selenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+
+/* Most of this code was borrowed from protocol-htmlunit; which in turn borrowed it from protocol-httpclient */
+
+public class HttpResponse implements Response {
+
+  private Http http;
+  private URL url;
+  private String orig;
+  private String base;
+  private byte[] content;
+  private int code;
+  private Metadata headers = new SpellCheckedMetadata();
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  /**
+   * Sends an HTTP/1.0 GET for {@code url} over a plain socket (via the proxy
+   * when {@code http.useProxy()} is true), parses the status line and the
+   * headers, and then loads the body.
+   * <p>
+   * When the Content-Type contains {@code text/html} or
+   * {@code application/xhtml} the body is obtained through
+   * {@link HttpWebClient}; for any other Content-Type it is read from the
+   * socket, limited to the Content-Length header and, when
+   * {@code http.getMaxContent()} is non-negative, to that value. When the
+   * response carries no Content-Type header no body is read and the content
+   * stays unset.
+   *
+   * @param http  protocol instance supplying timeouts, proxy settings and
+   *              request header values
+   * @param url   address to fetch; its protocol must be {@code http}
+   * @param datum crawl record; a positive modified time adds an
+   *              {@code If-Modified-Since} header
+   * @throws ProtocolException if the URL is not an http URL or the response
+   *         head is malformed (raised as {@code HttpException})
+   * @throws IOException on socket errors, or when reading the body of a
+   *         200 response fails
+   */
+  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    if (!"http".equals(url.getProtocol()))
+      throw new HttpException("Not an HTTP url:" + url);
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
+    // don't want the :80...
+
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+    Socket socket = null;
+
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect
+      String sockHost = http.useProxy() ? http.getProxyHost() : host;
+      int sockPort = http.useProxy() ? http.getProxyPort() : port;
+      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
+      socket.connect(sockAddr, http.getTimeout());
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      // purely local buffer, so the unsynchronized builder is sufficient
+      StringBuilder reqStr = new StringBuilder("GET ");
+      if (http.useProxy()) {
+        // a proxy needs the absolute URL on the request line
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ");
+      reqStr.append(host);
+      reqStr.append(portString);
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ");
+        reqStr.append(userAgent);
+        reqStr.append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      byte[] reqBytes = reqStr.toString().getBytes();
+
+      req.write(reqBytes);
+      req.flush();
+
+      PushbackInputStream in = // process response
+          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+              Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      // keep parsing status line + headers until a non-"100 Continue" head is seen
+      boolean haveSeenNonContinueStatus = false;
+      while (!haveSeenNonContinueStatus) {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
+      }
+
+      // Get Content type header
+      String contentType = getHeader(Response.CONTENT_TYPE);
+
+      // handle with Selenium only if content type in HTML or XHTML
+      if (contentType != null) {
+        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
+          readPlainContent(url);
+        } else {
+          try {
+            int contentLength = Integer.MAX_VALUE;
+            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+            if (contentLengthString != null) {
+              try {
+                contentLength = Integer.parseInt(contentLengthString.trim());
+              } catch (NumberFormatException ex) {
+                throw new HttpException("bad content length: " + contentLengthString, ex);
+              }
+            }
+
+            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+              contentLength = http.getMaxContent();
+            }
+
+            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+            int totalRead = 0;
+            ByteArrayOutputStream out = new ByteArrayOutputStream();
+            // Never request more than the bytes still allowed, so the body is
+            // cut exactly at contentLength instead of losing the whole final
+            // chunk whenever that chunk would cross the limit.
+            while (totalRead < contentLength) {
+              int bufferFilled = in.read(buffer, 0, Math.min(buffer.length, contentLength - totalRead));
+              if (bufferFilled == -1) {
+                break;
+              }
+              out.write(buffer, 0, bufferFilled);
+              totalRead += bufferFilled;
+            }
+
+            content = out.toByteArray();
+
+          } catch (Exception e) {
+            if (code == 200)
+              throw new IOException(e.toString(), e);
+            // for codes other than 200 OK, we are fine with empty content
+          } finally {
+            in.close();
+          }
+        }
+      }
+
+    } finally {
+      if (socket != null)
+        socket.close();
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  /** Returns the URL that was passed to the constructor. */
+  public URL getUrl() {
+    return url;
+  }
+
+  /** Returns the status code parsed from the response's status line. */
+  public int getCode() {
+    return code;
+  }
+
+  /**
+   * Looks up a single response header in the collected header metadata.
+   *
+   * @param name header name
+   * @return whatever the header metadata holds for {@code name}
+   */
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  /** Returns the metadata object holding all parsed response headers (not a copy). */
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  /**
+   * Returns the response body (the internal array, not a copy); this is
+   * {@code null} when the constructor never assigned a body.
+   */
+  public byte[] getContent() {
+    return content;
+  }
+
+  /* -------------------------- *
+   * </implementation:Response> *
+   * -------------------------- */
+
+  /**
+   * Fetches the page through {@link HttpWebClient} and stores the returned
+   * markup, encoded as UTF-8, as this response's content.
+   *
+   * @param url address of the page to load
+   * @throws IOException declared because the charset is looked up by name
+   */
+  private void readPlainContent(URL url) throws IOException {
+    content = HttpWebClient.getHtmlPage(url.toString(), conf).getBytes("UTF-8");
+  }
+
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int codeStart = line.indexOf(" ");
+    int codeEnd = line.indexOf(" ", codeStart + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    if (codeEnd == -1)
+      codeEnd = line.length();
+
+    int code;
+    try {
+      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+    }
+
+    return code;
+  }
+
+  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      int i;
+      for (i = 0; i < line.length(); i++)
+        if (!Character.isWhitespace(line.charAt(i)))
+          break;
+      if (i == line.length())
+        return;
+      throw new HttpException("No colon in header:" + line);
+    }
+    String key = line.substring(0, colonIndex);
+
+    int valueStart = colonIndex + 1; // skip whitespace
+    while (valueStart < line.length()) {
+      int c = line.charAt(valueStart);
+      if (c != ' ' && c != '\t')
+        break;
+      valueStart++;
+    }
+    String value = line.substring(valueStart);
+    headers.set(key, value);
+  }
+
+  // Adds headers to our headers Metadata
+  /**
+   * Reads header lines from {@code in} until an empty line is reached and
+   * adds each of them to the header metadata.
+   * <p>
+   * If a line contains the start of an HTML document, the blank line after
+   * the headers is assumed to be missing: the markup is pushed back onto the
+   * stream, the text in front of it is processed as the last header (failures
+   * are only logged) and parsing stops.
+   *
+   * @param in   stream positioned at the first header line
+   * @param line scratch buffer reused for every line
+   * @throws IOException   if reading fails
+   * @throws HttpException if a regular header line is malformed
+   */
+  private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers:
+      // try the known document-start markers in order of preference
+      int markupStart = line.indexOf("<!DOCTYPE");
+      if (markupStart == -1) {
+        markupStart = line.indexOf("<HTML");
+      }
+      if (markupStart == -1) {
+        markupStart = line.indexOf("<html");
+      }
+
+      if (markupStart == -1) {
+        processHeaderLine(line);
+        continue;
+      }
+
+      // give the markup back to the stream and keep only the header part
+      in.unread(line.substring(markupStart).getBytes("UTF-8"));
+      line.setLength(markupStart);
+
+      try {
+        // TODO: (CM) The header names are unknown here because they are
+        // handled generically. A mapping from the returned header names to
+        // the standard metadata names in the ParseData class would be nice.
+        processHeaderLine(line);
+      } catch (Exception e) {
+        // fixme:
+        Http.LOG.warn("Error: ", e);
+      }
+      return;
+    }
+  }
+
+  /**
+   * Reads one line from {@code in} into {@code line}, replacing its previous
+   * contents. A line ends at CR, LF or CRLF; the terminator is not stored.
+   * <p>
+   * With {@code allowContinuedLine} set, a non-empty line whose terminator is
+   * followed by a space or tab is treated as continued: that whitespace
+   * character is consumed and the following text is appended to the same
+   * buffer.
+   *
+   * @param in                 stream to read from
+   * @param line               buffer that receives the line
+   * @param allowContinuedLine whether continuation lines are folded in
+   * @return the number of characters now held in {@code line}
+   * @throws EOFException if the stream ends before a line terminator is read
+   * @throws IOException  if reading fails
+   */
+  private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
+      throws IOException {
+    line.setLength(0);
+    for (int c = in.read(); c != -1; c = in.read()) {
+      switch (c) {
+      case '\r':
+        // swallow the LF of a CRLF pair
+        if (peek(in) == '\n') {
+          in.read();
+        }
+        // no break: CR is handled as end of line below
+      case '\n':
+        if (line.length() > 0) {
+          // at EOL -- check for continued line if the current
+          // (possibly continued) line wasn't blank
+          if (allowContinuedLine)
+            switch (peek(in)) {
+            case ' ':
+            case '\t': // line is continued
+              in.read();
+              // resumes the enclosing for loop, i.e. keeps reading into line
+              continue;
+            }
+        }
+        return line.length(); // else complete
+      default:
+        line.append((char) c);
+      }
+    }
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte of {@code in} without consuming it.
+   *
+   * @param in stream to look into
+   * @return the next byte (0-255), or {@code -1} at end of stream
+   * @throws IOException if reading or pushing back fails
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    // Only push back real data: unread(-1) would insert a bogus 0xFF byte
+    // that later reads would see as content.
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+}

Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html Thu Feb 26 18:31:39 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p>
+</body>
+</html>

Added: nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html?rev=1662530&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html (added)
+++ nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html Thu Feb 26 18:31:39 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the htmlunit.</p><p></p>
+</body>
+</html>



______________________________________________________________________
This email has been scanned by the Symantec Email Security.cloud service.
For more information please visit http://www.symanteccloud.com
______________________________________________________________________

Gioele Zanzico
Senior Analyst
Vitec Group Imaging Division
Direct Line: +390424555507
Mobile:
Skype: gzanzico

[http://mediacdn.shopatron.com/media/mfg/2747/media_image/dev_2/mail_logo_vitec.png?1366390318]

Vitec Group Imaging Division, Via Valsugana 100, I-36022 Cassola (VI); Italy
T +39 (0424) 555 855 F +39 (0424) 808 999 www.vitecgroup.com<http://www.vitecgroup.com>
..............................................................................................................................................................................................................................................
You'll know us by the company we keep:
www.avenger-grip.com<http://www.avenger-grip.com>
www.colorama-photo.com<http://www.colorama-photo.com>
www.gitzo.com<http://www.gitzo.com>
www.lastolite.com<http://www.lastolite.com>
www.manfrotto.com<http://www.manfrotto.com>
www.geographicbags.com<http://www.geographicbags.com>
..............................................................................................................................................................................................................................................

A division of the Vitec Group plc. Registered office: Bridge house, Heron Square, Richmond, TW9 1EN, United Kingdom.
Registered in England no 227691
..............................................................................................................................................................................................................................................
This email and any attachments may contain privileged or confidential information and you must not read, copy, store or disclose them unless they are intended for you or
your organisation or you have received prior permission. If received in error, please delete and contact the sender. Unless stated to the contrary, any opinions or comments
are personal to the writer and do not necessarily represent the official view of the Company. While we have taken every precaution to minimise the risk of computer viruses,
we cannot accept any liability for such viruses and you should carry out your own virus checks before opening any attachments contained in this email.
..............................................................................................................................................................................................................................................
Please consider the environment before printing this email

Re: Unsubscribe

Posted by Julien Nioche <li...@gmail.com>.
Massimo,

http://nutch.apache.org/mailing_lists.html

=> dev-unsubscribe@nutch.apache.org

Thanks

On 26 February 2015 at 19:11, Massimo Miccoli <mm...@iltrovatore.it>
wrote:

>
>
> Massimo
>
> > Il giorno 26/feb/2015, alle ore 19:31, lewismc@apache.org ha scritto:
> >
> > Author: lewismc
> > Date: Thu Feb 26 18:31:39 2015
> > New Revision: 1662530
> >
> > URL: http://svn.apache.org/r1662530
> > Log:
> > NUTCH-1933 nutch-selenium plugin
> >
> > Added:
> >    nutch/trunk/src/plugin/lib-selenium/
> >    nutch/trunk/src/plugin/lib-selenium/build.xml
> >    nutch/trunk/src/plugin/lib-selenium/ivy.xml
> >    nutch/trunk/src/plugin/lib-selenium/plugin.xml
> >    nutch/trunk/src/plugin/lib-selenium/src/
> >    nutch/trunk/src/plugin/lib-selenium/src/java/
> >    nutch/trunk/src/plugin/lib-selenium/src/java/org/
> >    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/
> >    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/
> >
> nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/
> >
> nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/
> >
> nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
> >    nutch/trunk/src/plugin/protocol-selenium/
> >    nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
> >    nutch/trunk/src/plugin/protocol-selenium/build.xml
> >    nutch/trunk/src/plugin/protocol-selenium/ivy.xml
> >    nutch/trunk/src/plugin/protocol-selenium/plugin.xml
> >    nutch/trunk/src/plugin/protocol-selenium/src/
> >    nutch/trunk/src/plugin/protocol-selenium/src/java/
> >    nutch/trunk/src/plugin/protocol-selenium/src/java/org/
> >    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/
> >    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/
> >
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/
> >
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/
> >
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
> >
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
> >
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
> >    nutch/trunk/src/plugin/protocol-selenium/src/target/
> >    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/
> >    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/
> >
> nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/
> >
> nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/
> >
> nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/
> >
> nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/
> >
> nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
> > Modified:
> >    nutch/trunk/CHANGES.txt
> >    nutch/trunk/build.xml
> >    nutch/trunk/ivy/ivy.xml
> >    nutch/trunk/src/plugin/build.xml
> >
> > Modified: nutch/trunk/CHANGES.txt
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1662530&r1=1662529&r2=1662530&view=diff
> >
> ==============================================================================
> > --- nutch/trunk/CHANGES.txt (original)
> > +++ nutch/trunk/CHANGES.txt Thu Feb 26 18:31:39 2015
> > @@ -2,6 +2,8 @@ Nutch Change Log
> >
> > Nutch Current Development 1.10-SNAPSHOT
> >
> > +* NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin,
> lewismc)
> > +
> > * NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn,
> snagel, lewismc)
> >
> > * NUTCH-1724 LinkDBReader to support regex output filtering (markus)
> >
> > Modified: nutch/trunk/build.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
> >
> ==============================================================================
> > --- nutch/trunk/build.xml (original)
> > +++ nutch/trunk/build.xml Thu Feb 26 18:31:39 2015
> > @@ -184,6 +184,7 @@
> >       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
> >       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
> >       <packageset dir="${plugins.dir}/lib-http/src/java"/>
> > +      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
> >       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
> >       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
> >       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
> > @@ -197,6 +198,7 @@
> >       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
> >       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
> >       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
> > +      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
> >       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
> >       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
> >       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
> > @@ -591,6 +593,7 @@
> >       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
> >       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
> >       <packageset dir="${plugins.dir}/lib-http/src/java"/>
> > +      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
> >       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
> >       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
> >       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
> > @@ -604,6 +607,7 @@
> >       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
> >       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
> >       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
> > +      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
> >       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
> >       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
> >       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
> > @@ -985,6 +989,8 @@
> >         <source path="${plugins.dir}/language-identifier/src/test/" />
> >         <source path="${plugins.dir}/lib-http/src/java/" />
> >         <source path="${plugins.dir}/lib-http/src/test/" />
> > +        <source path="${plugins.dir}/lib-selenium/src/java/" />
> > +        <source path="${plugins.dir}/lib-selenium/src/test/" />
> >         <source path="${plugins.dir}/lib-regex-filter/src/java/" />
> >         <source path="${plugins.dir}/lib-regex-filter/src/test/" />
> >         <source path="${plugins.dir}/microformats-reltag/src/java/" />
> > @@ -1008,6 +1014,8 @@
> >         <source path="${plugins.dir}/protocol-httpclient/src/test/" />
> >         <source path="${plugins.dir}/protocol-http/src/java/" />
> >         <source path="${plugins.dir}/protocol-http/src/test/" />
> > +        <source path="${plugins.dir}/protocol-selenium/src/java"/>
> > +        <source path="${plugins.dir}/protocol-selenium/src/test"/>
> >         <source path="${plugins.dir}/scoring-depth/src/java/" />
> >         <source path="${plugins.dir}/scoring-link/src/java/" />
> >         <source path="${plugins.dir}/scoring-opic/src/java/" />
> >
> > Modified: nutch/trunk/ivy/ivy.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
> >
> ==============================================================================
> > --- nutch/trunk/ivy/ivy.xml (original)
> > +++ nutch/trunk/ivy/ivy.xml Thu Feb 26 18:31:39 2015
> > @@ -23,24 +23,24 @@
> >            database etc.
> >        </description>
> >    </info>
> > -
> > +
> >    <configurations>
> >        <include file="${basedir}/ivy/ivy-configurations.xml" />
> >    </configurations>
> > -
> > +
> >    <publications>
> >        <!--get the artifact from our module name -->
> >        <artifact conf="master" />
> >    </publications>
> > -
> > +
> >    <dependencies>
> >        <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
> >            conf="*->master" />
> >        <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
> >            conf="*->master" />
> > -
> > +
> >        <dependency org="log4j" name="log4j" rev="1.2.15"
> conf="*->master" />
> > -
> > +
> >        <dependency org="commons-lang" name="commons-lang" rev="2.6"
> >            conf="*->default" />
> >        <dependency org="commons-collections" name="commons-collections"
> > @@ -49,7 +49,7 @@
> >            rev="3.1" conf="*->master" />
> >        <dependency org="commons-codec" name="commons-codec" rev="1.3"
> >            conf="*->default" />
> > -
> > +
> >        <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0"
> >            conf="*->default">
> >            <exclude org="hsqldb" name="hsqldb" />
> >
> > Modified: nutch/trunk/src/plugin/build.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
> >
> ==============================================================================
> > --- nutch/trunk/src/plugin/build.xml (original)
> > +++ nutch/trunk/src/plugin/build.xml Thu Feb 26 18:31:39 2015
> > @@ -50,6 +50,8 @@
> >      <ant dir="protocol-ftp" target="deploy"/>
> >      <ant dir="protocol-http" target="deploy"/>
> >      <ant dir="protocol-httpclient" target="deploy"/>
> > +     <ant dir="lib-selenium" target="deploy"/>
> > +     <ant dir="protocol-selenium" target="deploy" />
> >      <ant dir="parse-ext" target="deploy"/>
> >      <ant dir="parse-js" target="deploy"/>
> >      <ant dir="parse-html" target="deploy"/>
> > @@ -149,6 +151,8 @@
> >     <ant dir="protocol-ftp" target="clean"/>
> >     <ant dir="protocol-http" target="clean"/>
> >     <ant dir="protocol-httpclient" target="clean"/>
> > +    <ant dir="lib-selenium" target="clean"/>
> > +    <ant dir="protocol-selenium" target="clean" />
> >     <ant dir="parse-ext" target="clean"/>
> >     <ant dir="parse-js" target="clean"/>
> >     <ant dir="parse-html" target="clean"/>
> >
> > Added: nutch/trunk/src/plugin/lib-selenium/build.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build.xml?rev=1662530&view=auto
> >
> ==============================================================================
> > --- nutch/trunk/src/plugin/lib-selenium/build.xml (added)
> > +++ nutch/trunk/src/plugin/lib-selenium/build.xml Thu Feb 26 18:31:39
> 2015
> > @@ -0,0 +1,28 @@
> > +<?xml version="1.0"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements.  See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License.  You may obtain a copy of the License at
> > +
> > +     http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + See the License for the specific language governing permissions and
> > + limitations under the License.
> > +-->
> > +<project name="lib-selenium" default="jar-core">
> > +
> > +  <import file="../build-plugin.xml"/>
> > +
> > +  <!-- Add compilation dependencies to classpath -->
> > +  <path id="plugin.deps">
> > +    <fileset dir="${nutch.root}/build">
> > +      <include name="**/lib-http/*.jar" />
> > +    </fileset>
> > +  </path>
> > +</project>
> >
> > Added: nutch/trunk/src/plugin/lib-selenium/ivy.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1662530&view=auto
> >
> ==============================================================================
> > --- nutch/trunk/src/plugin/lib-selenium/ivy.xml (added)
> > +++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Thu Feb 26 18:31:39 2015
> > @@ -0,0 +1,48 @@
> > +<?xml version="1.0" ?>
> > +
> > +<!--
> > +   Licensed to the Apache Software Foundation (ASF) under one or more
> > +   contributor license agreements.  See the NOTICE file distributed with
> > +   this work for additional information regarding copyright ownership.
> > +   The ASF licenses this file to You under the Apache License, Version
> 2.0
> > +   (the "License"); you may not use this file except in compliance with
> > +   the License.  You may obtain a copy of the License at
> > +
> > +       http://www.apache.org/licenses/LICENSE-2.0
> > +
> > +   Unless required by applicable law or agreed to in writing, software
> > +   distributed under the License is distributed on an "AS IS" BASIS,
> > +   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > +   See the License for the specific language governing permissions and
> > +   limitations under the License.
> > +-->
> > +
> > +<ivy-module version="1.0">
> > +  <info organisation="org.apache.nutch" module="${ant.project.name}">
> > +    <license name="Apache 2.0"/>
> > +    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
> > +    <description>
> > +        Apache Nutch
> > +    </description>
> > +  </info>
> > +
> > +  <configurations>
> > +    <include file="../../..//ivy/ivy-configurations.xml"/>
> > +  </configurations>
> > +
> > +  <publications>
> > +    <!--get the artifact from our module name-->
> > +    <artifact conf="master"/>
> > +  </publications>
> > +
> > +  <dependencies>
> > +    <!-- begin selenium dependencies -->
> > +    <dependency org="org.seleniumhq.selenium" name="selenium-java"
> rev="2.44.0" />
> > +
> > +    <dependency org="com.opera" name="operadriver" rev="1.5">
> > +      <exclude org="org.seleniumhq.selenium"
> name="selenium-remote-driver" />
> > +    </dependency>
> > +    <!-- end selenium dependencies -->
> > +  </dependencies>
> > +
> > +</ivy-module>
> >
> > Added: nutch/trunk/src/plugin/lib-selenium/plugin.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1662530&view=auto
> >
> ==============================================================================
> > --- nutch/trunk/src/plugin/lib-selenium/plugin.xml (added)
> > +++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Thu Feb 26 18:31:39
> 2015
> > @@ -0,0 +1,42 @@
> > +<?xml version="1.0" encoding="UTF-8"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements.  See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License.  You may obtain a copy of the License at
> > +
> > +     http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + See the License for the specific language governing permissions and
> > + limitations under the License.
> > +-->
> > +<!--
> > + ! Shared Selenium WebDriver support library used by Selenium-based protocol plugins
> > + !-->
> > +<plugin
> > +   id="lib-selenium"
> > +   name="HTTP Framework"
> > +   version="1.0"
> > +   provider-name="org.apache.nutch">
> > +
> > +   <runtime>
> > +     <library name="lib-selenium.jar">
> > +        <export name="*"/>
> > +     </library>
> > +   </runtime>
> > +
> > +   <requires>
> > +     <library name="selenium-java-2.44.0.jar">
> > +       <export name="*"/>
> > +     </library>
> > +     <library name="operadriver-1.5.jar">
> > +       <export name="*"/>
> > +       <exclude name="selenium-remote-driver" />
> > +     </library>
> > +   </requires>
> > +</plugin>
> >
> > Added:
> nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1662530&view=auto
> >
> ==============================================================================
> > ---
> nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
> (added)
> > +++
> nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
> Thu Feb 26 18:31:39 2015
> > @@ -0,0 +1,78 @@
> > +/**
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements.  See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License.  You may obtain a copy of the License at
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.nutch.protocol.selenium;
> > +
> > +import org.apache.hadoop.conf.Configuration;
> > +import org.slf4j.Logger;
> > +import org.slf4j.LoggerFactory;
> > +import org.openqa.selenium.By;
> > +import org.openqa.selenium.WebDriver;
> > +import org.openqa.selenium.firefox.FirefoxDriver;
> > +import org.openqa.selenium.firefox.FirefoxProfile;
> > +import org.openqa.selenium.support.ui.WebDriverWait;
> > +
> > +import java.lang.String;
> > +
> > +/**
> > + * Helper that loads a URL in a Selenium-driven Firefox browser and returns
> > + * the rendered markup of the page body.
> > + */
> > +public class HttpWebClient {
> > +
> > +  private static final Logger LOG =
> > +      LoggerFactory.getLogger("org.apache.nutch.protocol");
> > +
> > +  /** Seconds to wait for the page body to be present before giving up. */
> > +  private static final long PAGE_LOAD_WAIT_SECONDS = 3;
> > +
> > +  /**
> > +   * Per-thread Firefox driver created with stylesheets, images and the
> > +   * Flash plugin disabled. {@link #getHtmlPage(String, Configuration)}
> > +   * does not use it; it starts and quits its own driver on every call.
> > +   */
> > +  public static ThreadLocal<WebDriver> threadWebDriver =
> > +      new ThreadLocal<WebDriver>() {
> > +
> > +    @Override
> > +    protected WebDriver initialValue() {
> > +      FirefoxProfile profile = new FirefoxProfile();
> > +      profile.setPreference("permissions.default.stylesheet", 2);
> > +      profile.setPreference("permissions.default.image", 2);
> > +      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so",
> > +          "false");
> > +      return new FirefoxDriver(profile);
> > +    }
> > +  };
> > +
> > +  /**
> > +   * Loads {@code url} in a fresh Firefox instance and returns the
> > +   * {@code innerHTML} of its {@code body} element. The browser is quit
> > +   * before this method returns.
> > +   *
> > +   * @param url address of the page to load
> > +   * @param conf configuration; not read by this method
> > +   * @return the inner HTML of the page's body element
> > +   * @throws RuntimeException wrapping any failure to start the browser,
> > +   *           load the page or locate the body within the wait period
> > +   */
> > +  public static String getHtmlPage(String url, Configuration conf) {
> > +    WebDriver driver = null;
> > +
> > +    try {
> > +      driver = new FirefoxDriver();
> > +      driver.get(url);
> > +
> > +      // A WebDriverWait only waits when until() is called; constructing it
> > +      // alone (as before) returned immediately without waiting at all.
> > +      new WebDriverWait(driver, PAGE_LOAD_WAIT_SECONDS).until(
> > +          org.openqa.selenium.support.ui.ExpectedConditions
> > +              .presenceOfElementLocated(By.tagName("body")));
> > +
> > +      return driver.findElement(By.tagName("body"))
> > +          .getAttribute("innerHTML");
> > +
> > +    } catch (Exception e) {
> > +      // Broad catch kept for callers that expect a RuntimeException.
> > +      throw new RuntimeException(e);
> > +    } finally {
> > +      if (driver != null) {
> > +        try {
> > +          driver.quit();
> > +        } catch (Exception e) {
> > +          // Throwing from finally would mask the original failure or
> > +          // discard an already fetched page, so only log the problem.
> > +          LOG.warn("Failed to quit WebDriver after fetching " + url, e);
> > +        }
> > +      }
> > +    }
> > +  }
> > +
> > +  /**
> > +   * Convenience overload of {@link #getHtmlPage(String, Configuration)}
> > +   * that passes a {@code null} configuration.
> > +   *
> > +   * @param url address of the page to load
> > +   * @return the inner HTML of the page's body element
> > +   */
> > +  public static String getHtmlPage(String url) {
> > +    return getHtmlPage(url, null);
> > +  }
> > +}
> > \ No newline at end of file
> >
> > Added: nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml?rev=1662530&view=auto
> >
> ==============================================================================
> > --- nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml (added)
> > +++ nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml Thu Feb 26
> 18:31:39 2015
> > @@ -0,0 +1,54 @@
> > +<?xml version="1.0"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements.  See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License.  You may obtain a copy of the License at
> > +
> > +     http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + See the License for the specific language governing permissions and
> > + limitations under the License.
> > +-->
> > +<project name="protocol-selenium" default="deps-jar"
> xmlns:ivy="antlib:org.apache.ivy.ant">
> > +
> > +    <property name="ivy.install.version" value="2.1.0" />
> > +    <condition property="ivy.home" value="${env.IVY_HOME}">
> > +      <isset property="env.IVY_HOME" />
> > +    </condition>
> > +    <property name="ivy.home" value="${user.home}/.ant" />
> > +    <property name="ivy.checksums" value="" />
> > +    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
> > +    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
> > +
> > +    <target name="download-ivy" unless="offline">
> > +
> > +        <mkdir dir="${ivy.jar.dir}"/>
> > +        <!-- download Ivy from web site so that it can be used even
> without any special installation -->
> > +        <get src="
> http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar
> "
> > +             dest="${ivy.jar.file}" usetimestamp="true"/>
> > +    </target>
> > +
> > +    <target name="init-ivy" depends="download-ivy">
> > +      <!-- try to load ivy here from ivy home, in case the user has not
> already dropped
> > +              it into ant's lib dir (note that the latter copy will
> always take precedence).
> > +              We will not fail as long as local lib dir exists (it may
> be empty) and
> > +              ivy is in at least one of ant's lib dir or the local lib
> dir. -->
> > +        <path id="ivy.lib.path">
> > +            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
> > +
> > +        </path>
> > +        <taskdef resource="org/apache/ivy/ant/antlib.xml"
> > +                 uri="antlib:org.apache.ivy.ant"
> classpathref="ivy.lib.path"/>
> > +    </target>
> > +
> > +  <target name="deps-jar" depends="init-ivy">
> > +    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
> > +  </target>
> > +
> > +</project>
> >
> > Added: nutch/trunk/src/plugin/protocol-selenium/build.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build.xml?rev=1662530&view=auto
> >
> ==============================================================================
> > --- nutch/trunk/src/plugin/protocol-selenium/build.xml (added)
> > +++ nutch/trunk/src/plugin/protocol-selenium/build.xml Thu Feb 26
> 18:31:39 2015
> > @@ -0,0 +1,36 @@
> > +<?xml version="1.0"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements.  See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License.  You may obtain a copy of the License at
> > +
> > +     http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + See the License for the specific language governing permissions and
> > + limitations under the License.
> > +-->
> > +<project name="protocol-selenium" default="jar-core">
> > +
> > +  <import file="../build-plugin.xml"/>
> > +
> > +  <!-- Build compilation dependencies -->
> > +  <target name="deps-jar">
> > +    <ant target="jar" inheritall="false" dir="../lib-http"/>
> > +    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
> > +  </target>
> > +
> > +  <!-- Add compilation dependencies to classpath -->
> > +  <path id="plugin.deps">
> > +    <fileset dir="${nutch.root}/build">
> > +      <include name="**/lib-http/*.jar" />
> > +      <include name="**/lib-selenium/*.jar" />
> > +    </fileset>
> > +  </path>
> > +
> > +</project>
> >
> > Added: nutch/trunk/src/plugin/protocol-selenium/ivy.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1662530&view=auto
> >
> ==============================================================================
> > --- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (added)
> > +++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Thu Feb 26 18:31:39
> 2015
> > @@ -0,0 +1,48 @@
> > +<?xml version="1.0" ?>
> > +
> > +<!--
> > +   Licensed to the Apache Software Foundation (ASF) under one or more
> > +   contributor license agreements.  See the NOTICE file distributed with
> > +   this work for additional information regarding copyright ownership.
> > +   The ASF licenses this file to You under the Apache License, Version
> 2.0
> > +   (the "License"); you may not use this file except in compliance with
> > +   the License.  You may obtain a copy of the License at
> > +
> > +       http://www.apache.org/licenses/LICENSE-2.0
> > +
> > +   Unless required by applicable law or agreed to in writing, software
> > +   distributed under the License is distributed on an "AS IS" BASIS,
> > +   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > +   See the License for the specific language governing permissions and
> > +   limitations under the License.
> > +-->
> > +
> > +<ivy-module version="1.0">
> > +  <info organisation="org.apache.nutch" module="${ant.project.name}">
> > +    <license name="Apache 2.0"/>
> > +    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
> > +    <description>
> > +        Apache Nutch
> > +    </description>
> > +  </info>
> > +
> > +  <configurations>
> > +    <include file="../../..//ivy/ivy-configurations.xml"/>
> > +  </configurations>
> > +
> > +  <publications>
> > +    <!--get the artifact from our module name-->
> > +    <artifact conf="default"/>
> > +  </publications>
> > +
> > +  <dependencies>
> > +    <!-- begin selenium dependencies -->
> > +    <dependency org="org.seleniumhq.selenium" name="selenium-java"
> rev="2.44.0" />
> > +
> > +    <dependency org="com.opera" name="operadriver" rev="1.5">
> > +      <exclude org="org.seleniumhq.selenium"
> name="selenium-remote-driver" />
> > +    </dependency>
> > +    <!-- end selenium dependencies -->
> > +  </dependencies>
> > +
> > +</ivy-module>
> >
> > Added: nutch/trunk/src/plugin/protocol-selenium/plugin.xml
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/plugin.xml?rev=1662530&view=auto
> >
> ==============================================================================
> > --- nutch/trunk/src/plugin/protocol-selenium/plugin.xml (added)
> > +++ nutch/trunk/src/plugin/protocol-selenium/plugin.xml Thu Feb 26
> 18:31:39 2015
> > @@ -0,0 +1,90 @@
> > +<?xml version="1.0" encoding="UTF-8"?>
> > +<!--
> > + Licensed to the Apache Software Foundation (ASF) under one or more
> > + contributor license agreements.  See the NOTICE file distributed with
> > + this work for additional information regarding copyright ownership.
> > + The ASF licenses this file to You under the Apache License, Version 2.0
> > + (the "License"); you may not use this file except in compliance with
> > + the License.  You may obtain a copy of the License at
> > +
> > +     http://www.apache.org/licenses/LICENSE-2.0
> > +
> > + Unless required by applicable law or agreed to in writing, software
> > + distributed under the License is distributed on an "AS IS" BASIS,
> > + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + See the License for the specific language governing permissions and
> > + limitations under the License.
> > +-->
> > +<plugin
> > +   id="protocol-selenium"
> > +   name="Http Protocol Plug-in"
> > +   version="1.0.0"
> > +   provider-name="nutch.org">
> > +
> > +   <runtime>
> > +      <library name="protocol-selenium.jar">
> > +         <export name="*"/>
> > +      </library>
> > +      <library name="cglib-nodep-2.1_3.jar"/>
> > +      <library name="commons-codec-1.9.jar"/>
> > +      <library name="commons-collections-3.2.1.jar"/>
> > +      <library name="commons-exec-1.1.jar"/>
> > +      <library name="commons-io-2.4.jar"/>
> > +      <library name="commons-jxpath-1.3.jar"/>
> > +      <library name="commons-lang3-3.3.2.jar"/>
> > +      <library name="commons-logging-1.1.3.jar"/>
> > +      <library name="cssparser-0.9.14.jar"/>
> > +      <library name="gson-2.3.jar"/>
> > +      <library name="guava-18.0.jar"/>
> > +      <library name="htmlunit-2.15.jar"/>
> > +      <library name="htmlunit-core-js-2.15.jar"/>
> > +      <library name="httpclient-4.3.4.jar"/>
> > +      <library name="httpcore-4.3.2.jar"/>
> > +      <library name="httpmime-4.3.3.jar"/>
> > +      <library name="ini4j-0.5.2.jar"/>
> > +      <library name="jetty-http-8.1.15.v20140411.jar"/>
> > +      <library name="jetty-io-8.1.15.v20140411.jar"/>
> > +      <library name="jetty-util-8.1.15.v20140411.jar"/>
> > +      <library name="jetty-websocket-8.1.15.v20140411.jar"/>
> > +      <library name="jna-3.4.0.jar"/>
> > +      <library name="nekohtml-1.9.21.jar"/>
> > +      <library name="netty-3.5.2.Final.jar"/>
> > +      <library name="operadriver-1.5.jar"/>
> > +      <library name="operalaunchers-1.1.jar"/>
> > +      <library name="platform-3.4.0.jar"/>
> > +      <library name="protobuf-java-2.4.1.jar"/>
> > +      <library name="sac-1.3.jar"/>
> > +      <library name="selenium-api-2.44.0.jar"/>
> > +      <library name="selenium-chrome-driver-2.44.0.jar"/>
> > +      <library name="selenium-firefox-driver-2.44.0.jar"/>
> > +      <library name="selenium-htmlunit-driver-2.44.0.jar"/>
> > +      <library name="selenium-ie-driver-2.44.0.jar"/>
> > +      <library name="selenium-java-2.44.0.jar"/>
> > +      <library name="selenium-remote-driver-2.44.0.jar"/>
> > +      <library name="selenium-safari-driver-2.44.0.jar"/>
> > +      <library name="selenium-support-2.44.0.jar"/>
> > +      <library name="serializer-2.7.1.jar"/>
> > +      <library name="webbit-0.4.14.jar"/>
> > +      <library name="xalan-2.7.1.jar"/>
> > +      <library name="xercesImpl-2.11.0.jar"/>
> > +      <library name="xml-apis-1.4.01.jar"/>
> > +   </runtime>
> > +
> > +   <requires>
> > +      <import plugin="nutch-extensionpoints"/>
> > +      <import plugin="lib-http"/>
> > +      <import plugin="lib-selenium"/>
> > +   </requires>
> > +
> > +   <extension id="org.apache.nutch.protocol.selenium"
> > +              name="HttpProtocol"
> > +              point="org.apache.nutch.protocol.Protocol">
> > +
> > +      <implementation id="org.apache.nutch.protocol.selenium.Http"
> > +                      class="org.apache.nutch.protocol.selenium.Http">
> > +        <parameter name="protocolName" value="http"/>
> > +      </implementation>
> > +
> > +   </extension>
> > +
> > +</plugin>
> >
> > Added:
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java?rev=1662530&view=auto
> >
> ==============================================================================
> > ---
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
> (added)
> > +++
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
> Thu Feb 26 18:31:39 2015
> > @@ -0,0 +1,59 @@
> > +/**
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements.  See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License.  You may obtain a copy of the License at
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.nutch.protocol.selenium;
> > +
> > +// JDK imports
> > +import java.io.IOException;
> > +import java.net.URL;
> > +import org.apache.hadoop.conf.Configuration;
> > +import org.apache.nutch.crawl.CrawlDatum;
> > +import org.apache.nutch.net.protocols.Response;
> > +import org.apache.nutch.protocol.http.api.HttpBase;
> > +import org.apache.nutch.protocol.ProtocolException;
> > +import org.apache.nutch.util.NutchConfiguration;
> > +
> > +import org.apache.nutch.protocol.selenium.HttpResponse;
> > +
> > +import org.slf4j.Logger;
> > +import org.slf4j.LoggerFactory;
> > +
> > +public class Http extends HttpBase {
> > +
> > +  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
> > +
> > +  public Http() {
> > +    super(LOG);
> > +  }
> > +
> > +  @Override
> > +  public void setConf(Configuration conf) {
> > +    super.setConf(conf);
> > +  }
> > +
> > +  public static void main(String[] args) throws Exception {
> > +    Http http = new Http();
> > +    http.setConf(NutchConfiguration.create());
> > +    main(http, args);
> > +  }
> > +
> > +  @Override
> > +  protected Response getResponse(URL url, CrawlDatum datum, boolean
> redirect)
> > +      throws ProtocolException, IOException {
> > +    return new HttpResponse(this, url, datum);
> > +  }
> > +
> > +}
> >
> > Added:
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1662530&view=auto
> >
> ==============================================================================
> > ---
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
> (added)
> > +++
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
> Thu Feb 26 18:31:39 2015
> > @@ -0,0 +1,360 @@
> > +/**
> > + * Licensed to the Apache Software Foundation (ASF) under one or more
> > + * contributor license agreements.  See the NOTICE file distributed with
> > + * this work for additional information regarding copyright ownership.
> > + * The ASF licenses this file to You under the Apache License, Version
> 2.0
> > + * (the "License"); you may not use this file except in compliance with
> > + * the License.  You may obtain a copy of the License at
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +package org.apache.nutch.protocol.selenium;
> > +
> > +// JDK imports
> > +import java.io.BufferedInputStream;
> > +import java.io.EOFException;
> > +import java.io.IOException;
> > +import java.io.OutputStream;
> > +import java.io.ByteArrayOutputStream;
> > +import java.io.PushbackInputStream;
> > +import java.net.InetSocketAddress;
> > +import java.net.Socket;
> > +import java.net.URL;
> > +
> > +import org.apache.hadoop.conf.Configuration;
> > +import org.apache.nutch.crawl.CrawlDatum;
> > +import org.apache.nutch.metadata.Metadata;
> > +import org.apache.nutch.metadata.SpellCheckedMetadata;
> > +import org.apache.nutch.net.protocols.HttpDateFormat;
> > +import org.apache.nutch.net.protocols.Response;
> > +import org.apache.nutch.protocol.ProtocolException;
> > +import org.apache.nutch.protocol.http.api.HttpException;
> > +import org.apache.nutch.protocol.http.api.HttpBase;
> > +
> > +/* Most of this code was borrowed from protocol-htmlunit, which in turn
> borrowed it from protocol-httpclient */
> > +
> > +public class HttpResponse implements Response {
> > +
> > +  private Http http;
> > +  private URL url;
> > +  private String orig;
> > +  private String base;
> > +  private byte[] content;
> > +  private int code;
> > +  private Metadata headers = new SpellCheckedMetadata();
> > +
> > +  /** The nutch configuration */
> > +  private Configuration conf = null;
> > +
> > +  public HttpResponse(Http http, URL url, CrawlDatum datum) throws
> ProtocolException, IOException {
> > +
> > +    this.conf = http.getConf();
> > +    this.http = http;
> > +    this.url = url;
> > +    this.orig = url.toString();
> > +    this.base = url.toString();
> > +
> > +    if (!"http".equals(url.getProtocol()))
> > +      throw new HttpException("Not an HTTP url:" + url);
> > +
> > +    if (Http.LOG.isTraceEnabled()) {
> > +      Http.LOG.trace("fetching " + url);
> > +    }
> > +
> > +    String path = "".equals(url.getFile()) ? "/" : url.getFile();
> > +
> > +    // some servers will redirect a request with a host line like
> > +    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>"- they
> > +    // don't want the :80...
> > +
> > +    String host = url.getHost();
> > +    int port;
> > +    String portString;
> > +    if (url.getPort() == -1) {
> > +      port = 80;
> > +      portString = "";
> > +    } else {
> > +      port = url.getPort();
> > +      portString = ":" + port;
> > +    }
> > +    Socket socket = null;
> > +
> > +    try {
> > +      socket = new Socket(); // create the socket
> > +      socket.setSoTimeout(http.getTimeout());
> > +
> > +      // connect
> > +      String sockHost = http.useProxy() ? http.getProxyHost() : host;
> > +      int sockPort = http.useProxy() ? http.getProxyPort() : port;
> > +      InetSocketAddress sockAddr = new InetSocketAddress(sockHost,
> sockPort);
> > +      socket.connect(sockAddr, http.getTimeout());
> > +
> > +      // make request
> > +      OutputStream req = socket.getOutputStream();
> > +
> > +      StringBuffer reqStr = new StringBuffer("GET ");
> > +      if (http.useProxy()) {
> > +        reqStr.append(url.getProtocol() + "://" + host + portString +
> path);
> > +      } else {
> > +        reqStr.append(path);
> > +      }
> > +
> > +      reqStr.append(" HTTP/1.0\r\n");
> > +
> > +      reqStr.append("Host: ");
> > +      reqStr.append(host);
> > +      reqStr.append(portString);
> > +      reqStr.append("\r\n");
> > +
> > +      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
> > +
> > +      String userAgent = http.getUserAgent();
> > +      if ((userAgent == null) || (userAgent.length() == 0)) {
> > +        if (Http.LOG.isErrorEnabled()) {
> > +          Http.LOG.error("User-agent is not set!");
> > +        }
> > +      } else {
> > +        reqStr.append("User-Agent: ");
> > +        reqStr.append(userAgent);
> > +        reqStr.append("\r\n");
> > +      }
> > +
> > +      reqStr.append("Accept-Language: ");
> > +      reqStr.append(this.http.getAcceptLanguage());
> > +      reqStr.append("\r\n");
> > +
> > +      reqStr.append("Accept: ");
> > +      reqStr.append(this.http.getAccept());
> > +      reqStr.append("\r\n");
> > +
> > +      if (datum.getModifiedTime() > 0) {
> > +        reqStr.append("If-Modified-Since: " +
> HttpDateFormat.toString(datum.getModifiedTime()));
> > +        reqStr.append("\r\n");
> > +      }
> > +      reqStr.append("\r\n");
> > +
> > +      byte[] reqBytes = reqStr.toString().getBytes();
> > +
> > +      req.write(reqBytes);
> > +      req.flush();
> > +
> > +      PushbackInputStream in = // process response
> > +          new PushbackInputStream(new
> BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
> > +              Http.BUFFER_SIZE);
> > +
> > +      StringBuffer line = new StringBuffer();
> > +
> > +      boolean haveSeenNonContinueStatus = false;
> > +      while (!haveSeenNonContinueStatus) {
> > +        // parse status code line
> > +        this.code = parseStatusLine(in, line);
> > +        // parse headers
> > +        parseHeaders(in, line);
> > +        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
> > +      }
> > +
> > +      // Get Content type header
> > +      String contentType = getHeader(Response.CONTENT_TYPE);
> > +
> > +      // handle with Selenium only if content type is HTML or XHTML
> > +      if (contentType != null) {
> > +        if (contentType.contains("text/html") ||
> contentType.contains("application/xhtml")) {
> > +          readPlainContent(url);
> > +        } else {
> > +          try {
> > +            int contentLength = Integer.MAX_VALUE;
> > +            String contentLengthString =
> headers.get(Response.CONTENT_LENGTH);
> > +            if (contentLengthString != null) {
> > +              try {
> > +                contentLength =
> Integer.parseInt(contentLengthString.trim());
> > +              } catch (NumberFormatException ex) {
> > +                throw new HttpException("bad content length: " +
> contentLengthString);
> > +              }
> > +            }
> > +
> > +            if (http.getMaxContent() >= 0 && contentLength >
> http.getMaxContent()) {
> > +              contentLength = http.getMaxContent();
> > +            }
> > +
> > +            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
> > +            int bufferFilled = 0;
> > +            int totalRead = 0;
> > +            ByteArrayOutputStream out = new ByteArrayOutputStream();
> > +            while ((bufferFilled = in.read(buffer, 0, buffer.length))
> != -1
> > +                && totalRead + bufferFilled <= contentLength) {
> > +              totalRead += bufferFilled;
> > +              out.write(buffer, 0, bufferFilled);
> > +            }
> > +
> > +            content = out.toByteArray();
> > +
> > +          } catch (Exception e) {
> > +            if (code == 200)
> > +              throw new IOException(e.toString());
> > +            // for codes other than 200 OK, we are fine with empty
> content
> > +          } finally {
> > +            if (in != null) {
> > +              in.close();
> > +            }
> > +          }
> > +        }
> > +      }
> > +
> > +    } finally {
> > +      if (socket != null)
> > +        socket.close();
> > +    }
> > +  }
> > +
> > +  /* ------------------------- *
> > +   * <implementation:Response> *
> > +   * ------------------------- */
> > +
> > +  public URL getUrl() {
> > +    return url;
> > +  }
> > +
> > +  public int getCode() {
> > +    return code;
> > +  }
> > +
> > +  public String getHeader(String name) {
> > +    return headers.get(name);
> > +  }
> > +
> > +  public Metadata getHeaders() {
> > +    return headers;
> > +  }
> > +
> > +  public byte[] getContent() {
> > +    return content;
> > +  }
> > +
> > +  /* -------------------------- *
> > +   * </implementation:Response> *
> > +   * -------------------------- */
> > +
> > +  private void readPlainContent(URL url) throws IOException {
> > +    String page = HttpWebClient.getHtmlPage(url.toString(), conf);
> > +
> > +    content = page.getBytes("UTF-8");
> > +  }
> > +
> > +  private int parseStatusLine(PushbackInputStream in, StringBuffer
> line) throws IOException, HttpException {
> > +    readLine(in, line, false);
> > +
> > +    int codeStart = line.indexOf(" ");
> > +    int codeEnd = line.indexOf(" ", codeStart + 1);
> > +
> > +    // handle lines with no plaintext result code, i.e.:
> > +    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
> > +    if (codeEnd == -1)
> > +      codeEnd = line.length();
> > +
> > +    int code;
> > +    try {
> > +      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
> > +    } catch (NumberFormatException e) {
> > +      throw new HttpException("bad status line '" + line + "': " +
> e.getMessage(), e);
> > +    }
> > +
> > +    return code;
> > +  }
> > +
> > +  private void processHeaderLine(StringBuffer line) throws IOException,
> HttpException {
> > +
> > +    int colonIndex = line.indexOf(":"); // key is up to colon
> > +    if (colonIndex == -1) {
> > +      int i;
> > +      for (i = 0; i < line.length(); i++)
> > +        if (!Character.isWhitespace(line.charAt(i)))
> > +          break;
> > +      if (i == line.length())
> > +        return;
> > +      throw new HttpException("No colon in header:" + line);
> > +    }
> > +    String key = line.substring(0, colonIndex);
> > +
> > +    int valueStart = colonIndex + 1; // skip whitespace
> > +    while (valueStart < line.length()) {
> > +      int c = line.charAt(valueStart);
> > +      if (c != ' ' && c != '\t')
> > +        break;
> > +      valueStart++;
> > +    }
> > +    String value = line.substring(valueStart);
> > +    headers.set(key, value);
> > +  }
> > +
> > +  // Adds headers to our headers Metadata
> > +  private void parseHeaders(PushbackInputStream in, StringBuffer line)
> throws IOException, HttpException {
> > +
> > +    while (readLine(in, line, true) != 0) {
> > +
> > +      // handle HTTP responses with missing blank line after headers
> > +      int pos;
> > +      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos =
> line.indexOf("<HTML")) != -1)
> > +          || ((pos = line.indexOf("<html")) != -1)) {
> > +
> > +        in.unread(line.substring(pos).getBytes("UTF-8"));
> > +        line.setLength(pos);
> > +
> > +        try {
> > +          //TODO: (CM) We don't know the header names here
> > +          //since we're just handling them generically. It would
> > +          //be nice to provide some sort of mapping function here
> > +          //for the returned header names to the standard metadata
> > +          //names in the ParseData class
> > +          processHeaderLine(line);
> > +        } catch (Exception e) {
> > +          // fixme:
> > +          Http.LOG.warn("Error: ", e);
> > +        }
> > +        return;
> > +      }
> > +
> > +      processHeaderLine(line);
> > +    }
> > +  }
> > +
> > +  private static int readLine(PushbackInputStream in, StringBuffer
> line, boolean allowContinuedLine)
> > +      throws IOException {
> > +    line.setLength(0);
> > +    for (int c = in.read(); c != -1; c = in.read()) {
> > +      switch (c) {
> > +      case '\r':
> > +        if (peek(in) == '\n') {
> > +          in.read();
> > +        }
> > +      case '\n':
> > +        if (line.length() > 0) {
> > +          // at EOL -- check for continued line if the current
> > +          // (possibly continued) line wasn't blank
> > +          if (allowContinuedLine)
> > +            switch (peek(in)) {
> > +            case ' ':
> > +            case '\t': // line is continued
> > +              in.read();
> > +              continue;
> > +            }
> > +        }
> > +        return line.length(); // else complete
> > +      default:
> > +        line.append((char) c);
> > +      }
> > +    }
> > +    throw new EOFException();
> > +  }
> > +
> > +  private static int peek(PushbackInputStream in) throws IOException {
> > +    int value = in.read();
> > +    in.unread(value);
> > +    return value;
> > +  }
> > +}
> >
> > Added:
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html?rev=1662530&view=auto
> >
> ==============================================================================
> > ---
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
> (added)
> > +++
> nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
> Thu Feb 26 18:31:39 2015
> > @@ -0,0 +1,5 @@
> > +<html>
> > +<body>
> > +<p>Protocol plugin which supports retrieving documents via
> selenium.</p><p></p>
> > +</body>
> > +</html>
> >
> > Added:
> nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
> > URL:
> http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html?rev=1662530&view=auto
> >
> ==============================================================================
> > ---
> nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
> (added)
> > +++
> nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
> Thu Feb 26 18:31:39 2015
> > @@ -0,0 +1,5 @@
> > +<html>
> > +<body>
> > +<p>Protocol plugin which supports retrieving documents via the
> htmlunit.</p><p></p>
> > +</body>
> > +</html>
> >
> >
>



-- 

Open Source Solutions for Text Engineering

http://digitalpebble.blogspot.com/
http://www.digitalpebble.com
http://twitter.com/digitalpebble

Unsubscribe

Posted by Massimo Miccoli <mm...@iltrovatore.it>.

Massimo

> Il giorno 26/feb/2015, alle ore 19:31, lewismc@apache.org ha scritto:
> 
> Author: lewismc
> Date: Thu Feb 26 18:31:39 2015
> New Revision: 1662530
> 
> URL: http://svn.apache.org/r1662530
> Log:
> NUTCH-1933 nutch-selenium plugin
> 
> Added:
>    nutch/trunk/src/plugin/lib-selenium/
>    nutch/trunk/src/plugin/lib-selenium/build.xml
>    nutch/trunk/src/plugin/lib-selenium/ivy.xml
>    nutch/trunk/src/plugin/lib-selenium/plugin.xml
>    nutch/trunk/src/plugin/lib-selenium/src/
>    nutch/trunk/src/plugin/lib-selenium/src/java/
>    nutch/trunk/src/plugin/lib-selenium/src/java/org/
>    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/
>    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/
>    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/
>    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/
>    nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
>    nutch/trunk/src/plugin/protocol-selenium/
>    nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
>    nutch/trunk/src/plugin/protocol-selenium/build.xml
>    nutch/trunk/src/plugin/protocol-selenium/ivy.xml
>    nutch/trunk/src/plugin/protocol-selenium/plugin.xml
>    nutch/trunk/src/plugin/protocol-selenium/src/
>    nutch/trunk/src/plugin/protocol-selenium/src/java/
>    nutch/trunk/src/plugin/protocol-selenium/src/java/org/
>    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/
>    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/
>    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/
>    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/
>    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
>    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
>    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
>    nutch/trunk/src/plugin/protocol-selenium/src/target/
>    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/
>    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/
>    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/
>    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/
>    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/
>    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/
>    nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
> Modified:
>    nutch/trunk/CHANGES.txt
>    nutch/trunk/build.xml
>    nutch/trunk/ivy/ivy.xml
>    nutch/trunk/src/plugin/build.xml
> 
> Modified: nutch/trunk/CHANGES.txt
> URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1662530&r1=1662529&r2=1662530&view=diff
> ==============================================================================
> --- nutch/trunk/CHANGES.txt (original)
> +++ nutch/trunk/CHANGES.txt Thu Feb 26 18:31:39 2015
> @@ -2,6 +2,8 @@ Nutch Change Log
> 
> Nutch Current Development 1.10-SNAPSHOT
> 
> +* NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc)
> +
> * NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc)
> 
> * NUTCH-1724 LinkDBReader to support regex output filtering (markus)
> 
> Modified: nutch/trunk/build.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
> ==============================================================================
> --- nutch/trunk/build.xml (original)
> +++ nutch/trunk/build.xml Thu Feb 26 18:31:39 2015
> @@ -184,6 +184,7 @@
>       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
>       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
>       <packageset dir="${plugins.dir}/lib-http/src/java"/>
> +      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
>       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
>       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
>       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
> @@ -197,6 +198,7 @@
>       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
>       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
>       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
> +      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
>       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
>       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
>       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
> @@ -591,6 +593,7 @@
>       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
>       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
>       <packageset dir="${plugins.dir}/lib-http/src/java"/>
> +      <packageset dir="${plugins.dir}/lib-selenium/src/java"/>
>       <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/>
>       <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
>       <packageset dir="${plugins.dir}/parse-ext/src/java"/>
> @@ -604,6 +607,7 @@
>       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
>       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
>       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
> +      <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
>       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
>       <packageset dir="${plugins.dir}/scoring-link/src/java"/>
>       <packageset dir="${plugins.dir}/scoring-opic/src/java"/>
> @@ -985,6 +989,8 @@
>         <source path="${plugins.dir}/language-identifier/src/test/" />
>         <source path="${plugins.dir}/lib-http/src/java/" />
>         <source path="${plugins.dir}/lib-http/src/test/" />
> +        <source path="${plugins.dir}/lib-selenium/src/java/" />
> +        <source path="${plugins.dir}/lib-selenium/src/test/" />
>         <source path="${plugins.dir}/lib-regex-filter/src/java/" />
>         <source path="${plugins.dir}/lib-regex-filter/src/test/" />
>         <source path="${plugins.dir}/microformats-reltag/src/java/" />
> @@ -1008,6 +1014,8 @@
>         <source path="${plugins.dir}/protocol-httpclient/src/test/" />
>         <source path="${plugins.dir}/protocol-http/src/java/" />
>         <source path="${plugins.dir}/protocol-http/src/test/" />
> +        <source path="${plugins.dir}/protocol-selenium/src/java"/>
> +        <source path="${plugins.dir}/protocol-selenium/src/test"/>
>         <source path="${plugins.dir}/scoring-depth/src/java/" />
>         <source path="${plugins.dir}/scoring-link/src/java/" />
>         <source path="${plugins.dir}/scoring-opic/src/java/" />
> 
> Modified: nutch/trunk/ivy/ivy.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
> ==============================================================================
> --- nutch/trunk/ivy/ivy.xml (original)
> +++ nutch/trunk/ivy/ivy.xml Thu Feb 26 18:31:39 2015
> @@ -23,24 +23,24 @@
>            database etc.
>        </description>
>    </info>
> -
> +    
>    <configurations>
>        <include file="${basedir}/ivy/ivy-configurations.xml" />
>    </configurations>
> -
> +    
>    <publications>
>        <!--get the artifact from our module name -->
>        <artifact conf="master" />
>    </publications>
> -
> +    
>    <dependencies>
>        <dependency org="org.slf4j" name="slf4j-api" rev="1.6.1"
>            conf="*->master" />
>        <dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"
>            conf="*->master" />
> -
> +        
>        <dependency org="log4j" name="log4j" rev="1.2.15" conf="*->master" />
> -
> +        
>        <dependency org="commons-lang" name="commons-lang" rev="2.6"
>            conf="*->default" />
>        <dependency org="commons-collections" name="commons-collections"
> @@ -49,7 +49,7 @@
>            rev="3.1" conf="*->master" />
>        <dependency org="commons-codec" name="commons-codec" rev="1.3"
>            conf="*->default" />
> -
> +        
>        <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0"
>            conf="*->default">
>            <exclude org="hsqldb" name="hsqldb" />
> 
> Modified: nutch/trunk/src/plugin/build.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1662530&r1=1662529&r2=1662530&view=diff
> ==============================================================================
> --- nutch/trunk/src/plugin/build.xml (original)
> +++ nutch/trunk/src/plugin/build.xml Thu Feb 26 18:31:39 2015
> @@ -50,6 +50,8 @@
>      <ant dir="protocol-ftp" target="deploy"/>
>      <ant dir="protocol-http" target="deploy"/>
>      <ant dir="protocol-httpclient" target="deploy"/>
> +     <ant dir="lib-selenium" target="deploy"/>
> +     <ant dir="protocol-selenium" target="deploy" />
>      <ant dir="parse-ext" target="deploy"/>
>      <ant dir="parse-js" target="deploy"/>
>      <ant dir="parse-html" target="deploy"/>
> @@ -149,6 +151,8 @@
>     <ant dir="protocol-ftp" target="clean"/>
>     <ant dir="protocol-http" target="clean"/>
>     <ant dir="protocol-httpclient" target="clean"/>
> +    <ant dir="lib-selenium" target="clean"/>
> +    <ant dir="protocol-selenium" target="clean" />
>     <ant dir="parse-ext" target="clean"/>
>     <ant dir="parse-js" target="clean"/>
>     <ant dir="parse-html" target="clean"/>
> 
> Added: nutch/trunk/src/plugin/lib-selenium/build.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build.xml?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/lib-selenium/build.xml (added)
> +++ nutch/trunk/src/plugin/lib-selenium/build.xml Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,28 @@
> +<?xml version="1.0"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements.  See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License.  You may obtain a copy of the License at
> +
> +     http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<project name="lib-selenium" default="jar-core">
> +
> +  <import file="../build-plugin.xml"/>
> +
> +  <!-- Add compilation dependencies to classpath -->
> +  <path id="plugin.deps">    
> +    <fileset dir="${nutch.root}/build">
> +      <include name="**/lib-http/*.jar" />
> +    </fileset>
> +  </path>
> +</project>
> 
> Added: nutch/trunk/src/plugin/lib-selenium/ivy.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/lib-selenium/ivy.xml (added)
> +++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,48 @@
> +<?xml version="1.0" ?>
> +
> +<!--
> +   Licensed to the Apache Software Foundation (ASF) under one or more
> +   contributor license agreements.  See the NOTICE file distributed with
> +   this work for additional information regarding copyright ownership.
> +   The ASF licenses this file to You under the Apache License, Version 2.0
> +   (the "License"); you may not use this file except in compliance with
> +   the License.  You may obtain a copy of the License at
> +
> +       http://www.apache.org/licenses/LICENSE-2.0
> +
> +   Unless required by applicable law or agreed to in writing, software
> +   distributed under the License is distributed on an "AS IS" BASIS,
> +   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +   See the License for the specific language governing permissions and
> +   limitations under the License.
> +-->
> +
> +<ivy-module version="1.0">
> +  <info organisation="org.apache.nutch" module="${ant.project.name}">
> +    <license name="Apache 2.0"/>
> +    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
> +    <description>
> +        Apache Nutch
> +    </description>
> +  </info>
> +
> +  <configurations>
> +    <include file="../../..//ivy/ivy-configurations.xml"/>
> +  </configurations>
> +
> +  <publications>
> +    <!--get the artifact from our module name-->
> +    <artifact conf="master"/>
> +  </publications>
> +
> +  <dependencies>
> +    <!-- begin selenium dependencies -->
> +    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
> +    
> +    <dependency org="com.opera" name="operadriver" rev="1.5">
> +      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
> +    </dependency>
> +    <!-- end selenium dependencies -->
> +  </dependencies>
> +  
> +</ivy-module>
> 
> Added: nutch/trunk/src/plugin/lib-selenium/plugin.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/lib-selenium/plugin.xml (added)
> +++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,42 @@
> +<?xml version="1.0" encoding="UTF-8"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements.  See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License.  You may obtain a copy of the License at
> +
> +     http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<!--
> + ! A shared Selenium WebDriver library for Nutch protocol plugins
> + !-->
> +<plugin
> +   id="lib-selenium"
> +   name="HTTP Framework"
> +   version="1.0"
> +   provider-name="org.apache.nutch">
> +
> +   <runtime>
> +     <library name="lib-selenium.jar">
> +        <export name="*"/>
> +     </library>       
> +   </runtime>
> +
> +   <requires>
> +     <library name="selenium-java-2.4.0.jar">
> +       <export name="*"/>
> +     </library>
> +     <library name="operadriver-1.5.jar">
> +       <export name="*"/>
> +       <exclude name="selenium-remote-driver" />
> +     </library>
> +   </requires>
> +</plugin>
> 
> Added: nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java (added)
> +++ nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,78 @@
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.nutch.protocol.selenium;
> +
> +import org.apache.hadoop.conf.Configuration;
> +import org.slf4j.Logger;
> +import org.slf4j.LoggerFactory;
> +import org.openqa.selenium.By;
> +import org.openqa.selenium.WebDriver;
> +import org.openqa.selenium.firefox.FirefoxDriver;
> +import org.openqa.selenium.firefox.FirefoxProfile;
> +import org.openqa.selenium.support.ui.WebDriverWait;
> +
> +import java.lang.String;
> +
> +public class HttpWebClient {
> +
> +  private static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.protocol");
> +
> +  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
> +
> +    @Override
> +    protected WebDriver initialValue()
> +    {
> +      FirefoxProfile profile = new FirefoxProfile();
> +      profile.setPreference("permissions.default.stylesheet", 2);
> +      profile.setPreference("permissions.default.image", 2);
> +      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
> +      WebDriver driver = new FirefoxDriver(profile);
> +      return driver;
> +    };
> +  };
> +
> +  public static String getHtmlPage(String url, Configuration conf) {
> +    WebDriver driver = null;
> +
> +    try {
> +      driver = new FirefoxDriver();
> +      //} WebDriver driver = threadWebDriver.get();
> +      //  if (driver == null) {
> +      //    driver = new FirefoxDriver();
> +      //  }
> +
> +      driver.get(url);
> +
> +      // NOTE(review): meant to wait up to 3 seconds for the page to load, but a WebDriverWait only blocks when until(...) is called
> +      new WebDriverWait(driver, 3);
> +
> +      String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");
> +
> +      return innerHtml;
> +
> +      // This broad catch statement is probably a code smell; it was borrowed from lib-htmlunit
> +    } catch (Exception e) {
> +      throw new RuntimeException(e);
> +    } finally {
> +      if (driver != null) try { driver.quit(); } catch (Exception e) { throw new RuntimeException(e); }
> +    }
> +  };
> +
> +  public static String getHtmlPage(String url) {
> +    return getHtmlPage(url, null);
> +  }
> +}
> \ No newline at end of file
> 
> Added: nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml (added)
> +++ nutch/trunk/src/plugin/protocol-selenium/build-ivy.xml Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,54 @@
> +<?xml version="1.0"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements.  See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License.  You may obtain a copy of the License at
> +
> +     http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<project name="protocol-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
> +
> +    <property name="ivy.install.version" value="2.1.0" />
> +    <condition property="ivy.home" value="${env.IVY_HOME}">
> +      <isset property="env.IVY_HOME" />
> +    </condition>
> +    <property name="ivy.home" value="${user.home}/.ant" />
> +    <property name="ivy.checksums" value="" />
> +    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
> +    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
> +
> +    <target name="download-ivy" unless="offline">
> +
> +        <mkdir dir="${ivy.jar.dir}"/>
> +        <!-- download Ivy from web site so that it can be used even without any special installation -->
> +        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
> +             dest="${ivy.jar.file}" usetimestamp="true"/>
> +    </target>
> +
> +    <target name="init-ivy" depends="download-ivy">
> +      <!-- try to load ivy here from ivy home, in case the user has not already dropped
> +              it into ant's lib dir (note that the latter copy will always take precedence).
> +              We will not fail as long as local lib dir exists (it may be empty) and
> +              ivy is in at least one of ant's lib dir or the local lib dir. -->
> +        <path id="ivy.lib.path">
> +            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
> +
> +        </path>
> +        <taskdef resource="org/apache/ivy/ant/antlib.xml"
> +                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
> +    </target>
> +
> +  <target name="deps-jar" depends="init-ivy">
> +    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
> +  </target>
> +
> +</project>
> 
> Added: nutch/trunk/src/plugin/protocol-selenium/build.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/build.xml?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/protocol-selenium/build.xml (added)
> +++ nutch/trunk/src/plugin/protocol-selenium/build.xml Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,36 @@
> +<?xml version="1.0"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements.  See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License.  You may obtain a copy of the License at
> +
> +     http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<project name="protocol-selenium" default="jar-core">
> +
> +  <import file="../build-plugin.xml"/>
> +
> +  <!-- Build compilation dependencies -->
> +  <target name="deps-jar">
> +    <ant target="jar" inheritall="false" dir="../lib-http"/>
> +    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
> +  </target>
> +
> +  <!-- Add compilation dependencies to classpath -->
> +  <path id="plugin.deps">
> +    <fileset dir="${nutch.root}/build">
> +      <include name="**/lib-http/*.jar" />
> +      <include name="**/lib-selenium/*.jar" />
> +    </fileset>
> +  </path>
> +
> +</project>
> 
> Added: nutch/trunk/src/plugin/protocol-selenium/ivy.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (added)
> +++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,48 @@
> +<?xml version="1.0" ?>
> +
> +<!--
> +   Licensed to the Apache Software Foundation (ASF) under one or more
> +   contributor license agreements.  See the NOTICE file distributed with
> +   this work for additional information regarding copyright ownership.
> +   The ASF licenses this file to You under the Apache License, Version 2.0
> +   (the "License"); you may not use this file except in compliance with
> +   the License.  You may obtain a copy of the License at
> +
> +       http://www.apache.org/licenses/LICENSE-2.0
> +
> +   Unless required by applicable law or agreed to in writing, software
> +   distributed under the License is distributed on an "AS IS" BASIS,
> +   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +   See the License for the specific language governing permissions and
> +   limitations under the License.
> +-->
> +
> +<ivy-module version="1.0">
> +  <info organisation="org.apache.nutch" module="${ant.project.name}">
> +    <license name="Apache 2.0"/>
> +    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
> +    <description>
> +        Apache Nutch
> +    </description>
> +  </info>
> +
> +  <configurations>
> +    <include file="../../..//ivy/ivy-configurations.xml"/>
> +  </configurations>
> +
> +  <publications>
> +    <!--get the artifact from our module name-->
> +    <artifact conf="default"/>
> +  </publications>
> +
> +  <dependencies>
> +    <!-- begin selenium dependencies -->
> +    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
> +    
> +    <dependency org="com.opera" name="operadriver" rev="1.5">
> +      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
> +    </dependency>
> +    <!-- end selenium dependencies -->
> +  </dependencies>
> +  
> +</ivy-module>
> 
> Added: nutch/trunk/src/plugin/protocol-selenium/plugin.xml
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/plugin.xml?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/protocol-selenium/plugin.xml (added)
> +++ nutch/trunk/src/plugin/protocol-selenium/plugin.xml Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,90 @@
> +<?xml version="1.0" encoding="UTF-8"?>
> +<!--
> + Licensed to the Apache Software Foundation (ASF) under one or more
> + contributor license agreements.  See the NOTICE file distributed with
> + this work for additional information regarding copyright ownership.
> + The ASF licenses this file to You under the Apache License, Version 2.0
> + (the "License"); you may not use this file except in compliance with
> + the License.  You may obtain a copy of the License at
> +
> +     http://www.apache.org/licenses/LICENSE-2.0
> +
> + Unless required by applicable law or agreed to in writing, software
> + distributed under the License is distributed on an "AS IS" BASIS,
> + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + See the License for the specific language governing permissions and
> + limitations under the License.
> +-->
> +<plugin
> +   id="protocol-selenium"
> +   name="Http Protocol Plug-in"
> +   version="1.0.0"
> +   provider-name="nutch.org">
> +
> +   <runtime>
> +      <library name="protocol-selenium.jar">
> +         <export name="*"/>
> +      </library>
> +      <library name="cglib-nodep-2.1_3.jar"/>
> +      <library name="commons-codec-1.9.jar"/>
> +      <library name="commons-collections-3.2.1.jar"/>
> +      <library name="commons-exec-1.1.jar"/>
> +      <library name="commons-io-2.4.jar"/>
> +      <library name="commons-jxpath-1.3.jar"/>
> +      <library name="commons-lang3-3.3.2.jar"/>
> +      <library name="commons-logging-1.1.3.jar"/>
> +      <library name="cssparser-0.9.14.jar"/>
> +      <library name="gson-2.3.jar"/>
> +      <library name="guava-18.0.jar"/>
> +      <library name="htmlunit-2.15.jar"/>
> +      <library name="htmlunit-core-js-2.15.jar"/>
> +      <library name="httpclient-4.3.4.jar"/>
> +      <library name="httpcore-4.3.2.jar"/>
> +      <library name="httpmime-4.3.3.jar"/>
> +      <library name="ini4j-0.5.2.jar"/>
> +      <library name="jetty-http-8.1.15.v20140411.jar"/>
> +      <library name="jetty-io-8.1.15.v20140411.jar"/>
> +      <library name="jetty-util-8.1.15.v20140411.jar"/>
> +      <library name="jetty-websocket-8.1.15.v20140411.jar"/>
> +      <library name="jna-3.4.0.jar"/>
> +      <library name="nekohtml-1.9.21.jar"/>
> +      <library name="netty-3.5.2.Final.jar"/>
> +      <library name="operadriver-1.5.jar"/>
> +      <library name="operalaunchers-1.1.jar"/>
> +      <library name="platform-3.4.0.jar"/>
> +      <library name="protobuf-java-2.4.1.jar"/>
> +      <library name="sac-1.3.jar"/>
> +      <library name="selenium-api-2.44.0.jar"/>
> +      <library name="selenium-chrome-driver-2.44.0.jar"/>
> +      <library name="selenium-firefox-driver-2.44.0.jar"/>
> +      <library name="selenium-htmlunit-driver-2.44.0.jar"/>
> +      <library name="selenium-ie-driver-2.44.0.jar"/>
> +      <library name="selenium-java-2.44.0.jar"/>
> +      <library name="selenium-remote-driver-2.44.0.jar"/>
> +      <library name="selenium-safari-driver-2.44.0.jar"/>
> +      <library name="selenium-support-2.44.0.jar"/>
> +      <library name="serializer-2.7.1.jar"/>
> +      <library name="webbit-0.4.14.jar"/>
> +      <library name="xalan-2.7.1.jar"/>
> +      <library name="xercesImpl-2.11.0.jar"/>
> +      <library name="xml-apis-1.4.01.jar"/>
> +   </runtime>
> +
> +   <requires>
> +      <import plugin="nutch-extensionpoints"/>
> +      <import plugin="lib-http"/>
> +      <import plugin="lib-selenium"/>
> +   </requires>
> +
> +   <extension id="org.apache.nutch.protocol.selenium"
> +              name="HttpProtocol"
> +              point="org.apache.nutch.protocol.Protocol">
> +
> +      <implementation id="org.apache.nutch.protocol.selenium.Http"
> +                      class="org.apache.nutch.protocol.selenium.Http">
> +        <parameter name="protocolName" value="http"/>
> +      </implementation>
> +
> +   </extension>
> +
> +</plugin>
> 
> Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java (added)
> +++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,59 @@
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.nutch.protocol.selenium;
> +
> +// JDK imports
> +import java.io.IOException;
> +import java.net.URL;
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.nutch.crawl.CrawlDatum;
> +import org.apache.nutch.net.protocols.Response;
> +import org.apache.nutch.protocol.http.api.HttpBase;
> +import org.apache.nutch.protocol.ProtocolException;
> +import org.apache.nutch.util.NutchConfiguration;
> +
> +import org.apache.nutch.protocol.selenium.HttpResponse;
> +
> +import org.slf4j.Logger;
> +import org.slf4j.LoggerFactory;
> +
> +public class Http extends HttpBase {
> +
> +  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
> +
> +  public Http() {
> +    super(LOG);
> +  }
> +
> +  @Override
> +  public void setConf(Configuration conf) {
> +    super.setConf(conf);
> +  }
> +
> +  public static void main(String[] args) throws Exception {
> +    Http http = new Http();
> +    http.setConf(NutchConfiguration.create());
> +    main(http, args);
> +  }
> +
> +  @Override
> +  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
> +      throws ProtocolException, IOException {
> +    return new HttpResponse(this, url, datum);
> +  }
> +
> +}
> 
> Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java (added)
> +++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,360 @@
> +/**
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements.  See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License.  You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package org.apache.nutch.protocol.selenium;
> +
> +// JDK imports
> +import java.io.BufferedInputStream;
> +import java.io.EOFException;
> +import java.io.IOException;
> +import java.io.OutputStream;
> +import java.io.ByteArrayOutputStream;
> +import java.io.PushbackInputStream;
> +import java.net.InetSocketAddress;
> +import java.net.Socket;
> +import java.net.URL;
> +
> +import org.apache.hadoop.conf.Configuration;
> +import org.apache.nutch.crawl.CrawlDatum;
> +import org.apache.nutch.metadata.Metadata;
> +import org.apache.nutch.metadata.SpellCheckedMetadata;
> +import org.apache.nutch.net.protocols.HttpDateFormat;
> +import org.apache.nutch.net.protocols.Response;
> +import org.apache.nutch.protocol.ProtocolException;
> +import org.apache.nutch.protocol.http.api.HttpException;
> +import org.apache.nutch.protocol.http.api.HttpBase;
> +
> +/* Most of this code was borrowed from protocol-htmlunit, which in turn borrowed it from protocol-httpclient */
> +
> +public class HttpResponse implements Response {
> +
> +  private Http http;
> +  private URL url;
> +  private String orig;
> +  private String base;
> +  private byte[] content;
> +  private int code;
> +  private Metadata headers = new SpellCheckedMetadata();
> +
> +  /** The nutch configuration */
> +  private Configuration conf = null;
> +
> +  public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
> +
> +    this.conf = http.getConf();
> +    this.http = http;
> +    this.url = url;
> +    this.orig = url.toString();
> +    this.base = url.toString();
> +
> +    if (!"http".equals(url.getProtocol()))
> +      throw new HttpException("Not an HTTP url:" + url);
> +
> +    if (Http.LOG.isTraceEnabled()) {
> +      Http.LOG.trace("fetching " + url);
> +    }
> +
> +    String path = "".equals(url.getFile()) ? "/" : url.getFile();
> +
> +    // some servers will redirect a request with a host line like
> +    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
> +    // don't want the :80...
> +
> +    String host = url.getHost();
> +    int port;
> +    String portString;
> +    if (url.getPort() == -1) {
> +      port = 80;
> +      portString = "";
> +    } else {
> +      port = url.getPort();
> +      portString = ":" + port;
> +    }
> +    Socket socket = null;
> +
> +    try {
> +      socket = new Socket(); // create the socket
> +      socket.setSoTimeout(http.getTimeout());
> +
> +      // connect
> +      String sockHost = http.useProxy() ? http.getProxyHost() : host;
> +      int sockPort = http.useProxy() ? http.getProxyPort() : port;
> +      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
> +      socket.connect(sockAddr, http.getTimeout());
> +
> +      // make request
> +      OutputStream req = socket.getOutputStream();
> +
> +      StringBuffer reqStr = new StringBuffer("GET ");
> +      if (http.useProxy()) {
> +        reqStr.append(url.getProtocol() + "://" + host + portString + path);
> +      } else {
> +        reqStr.append(path);
> +      }
> +
> +      reqStr.append(" HTTP/1.0\r\n");
> +
> +      reqStr.append("Host: ");
> +      reqStr.append(host);
> +      reqStr.append(portString);
> +      reqStr.append("\r\n");
> +
> +      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
> +
> +      String userAgent = http.getUserAgent();
> +      if ((userAgent == null) || (userAgent.length() == 0)) {
> +        if (Http.LOG.isErrorEnabled()) {
> +          Http.LOG.error("User-agent is not set!");
> +        }
> +      } else {
> +        reqStr.append("User-Agent: ");
> +        reqStr.append(userAgent);
> +        reqStr.append("\r\n");
> +      }
> +
> +      reqStr.append("Accept-Language: ");
> +      reqStr.append(this.http.getAcceptLanguage());
> +      reqStr.append("\r\n");
> +
> +      reqStr.append("Accept: ");
> +      reqStr.append(this.http.getAccept());
> +      reqStr.append("\r\n");
> +
> +      if (datum.getModifiedTime() > 0) {
> +        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
> +        reqStr.append("\r\n");
> +      }
> +      reqStr.append("\r\n");
> +
> +      byte[] reqBytes = reqStr.toString().getBytes();
> +
> +      req.write(reqBytes);
> +      req.flush();
> +
> +      PushbackInputStream in = // process response
> +          new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
> +              Http.BUFFER_SIZE);
> +
> +      StringBuffer line = new StringBuffer();
> +
> +      boolean haveSeenNonContinueStatus = false;
> +      while (!haveSeenNonContinueStatus) {
> +        // parse status code line
> +        this.code = parseStatusLine(in, line);
> +        // parse headers
> +        parseHeaders(in, line);
> +        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
> +      }
> +
> +      // Get Content type header
> +      String contentType = getHeader(Response.CONTENT_TYPE);
> +
> +      // handle with Selenium only if content type is HTML or XHTML
> +      if (contentType != null) {
> +        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
> +          readPlainContent(url);
> +        } else {
> +          try {
> +            int contentLength = Integer.MAX_VALUE;
> +            String contentLengthString = headers.get(Response.CONTENT_LENGTH);
> +            if (contentLengthString != null) {
> +              try {
> +                contentLength = Integer.parseInt(contentLengthString.trim());
> +              } catch (NumberFormatException ex) {
> +                throw new HttpException("bad content length: " + contentLengthString);
> +              }
> +            }
> +
> +            if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
> +              contentLength = http.getMaxContent();
> +            }
> +
> +            byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
> +            int bufferFilled = 0;
> +            int totalRead = 0;
> +            ByteArrayOutputStream out = new ByteArrayOutputStream();
> +            while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1
> +                && totalRead + bufferFilled <= contentLength) {
> +              totalRead += bufferFilled;
> +              out.write(buffer, 0, bufferFilled);
> +            }
> +
> +            content = out.toByteArray();
> +
> +          } catch (Exception e) {
> +            if (code == 200)
> +              throw new IOException(e.toString());
> +            // for codes other than 200 OK, we are fine with empty content
> +          } finally {
> +            if (in != null) {
> +              in.close();
> +            }
> +          }
> +        }
> +      } 
> +
> +    } finally {
> +      if (socket != null)
> +        socket.close();
> +    }
> +  }
> +
> +  /* ------------------------- *
> +   * <implementation:Response> *
> +   * ------------------------- */
> +
> +  public URL getUrl() {
> +    return url;
> +  }
> +
> +  public int getCode() {
> +    return code;
> +  }
> +
> +  public String getHeader(String name) {
> +    return headers.get(name);
> +  }
> +
> +  public Metadata getHeaders() {
> +    return headers;
> +  }
> +
> +  public byte[] getContent() {
> +    return content;
> +  }
> +
> +  /* -------------------------- *
> +   * </implementation:Response> *
> +   * -------------------------- */
> +
> +  private void readPlainContent(URL url) throws IOException {
> +    String page = HttpWebClient.getHtmlPage(url.toString(), conf);
> +
> +    content = page.getBytes("UTF-8");
> +  }
> +
> +  private int parseStatusLine(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
> +    readLine(in, line, false);
> +
> +    int codeStart = line.indexOf(" ");
> +    int codeEnd = line.indexOf(" ", codeStart + 1);
> +
> +    // handle lines with no plaintext result code, ie:
> +    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
> +    if (codeEnd == -1)
> +      codeEnd = line.length();
> +
> +    int code;
> +    try {
> +      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
> +    } catch (NumberFormatException e) {
> +      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
> +    }
> +
> +    return code;
> +  }
> +
> +  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
> +
> +    int colonIndex = line.indexOf(":"); // key is up to colon
> +    if (colonIndex == -1) {
> +      int i;
> +      for (i = 0; i < line.length(); i++)
> +        if (!Character.isWhitespace(line.charAt(i)))
> +          break;
> +      if (i == line.length())
> +        return;
> +      throw new HttpException("No colon in header:" + line);
> +    }
> +    String key = line.substring(0, colonIndex);
> +
> +    int valueStart = colonIndex + 1; // skip whitespace
> +    while (valueStart < line.length()) {
> +      int c = line.charAt(valueStart);
> +      if (c != ' ' && c != '\t')
> +        break;
> +      valueStart++;
> +    }
> +    String value = line.substring(valueStart);
> +    headers.set(key, value);
> +  }
> +
> +  // Adds headers to our headers Metadata
> +  private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException {
> +
> +    while (readLine(in, line, true) != 0) {
> +
> +      // handle HTTP responses with missing blank line after headers
> +      int pos;
> +      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1)
> +          || ((pos = line.indexOf("<html")) != -1)) {
> +
> +        in.unread(line.substring(pos).getBytes("UTF-8"));
> +        line.setLength(pos);
> +
> +        try {
> +          //TODO: (CM) We don't know the header names here
> +          //since we're just handling them generically. It would
> +          //be nice to provide some sort of mapping function here
> +          //for the returned header names to the standard metadata
> +          //names in the ParseData class
> +          processHeaderLine(line);
> +        } catch (Exception e) {
> +          // fixme:
> +          Http.LOG.warn("Error: ", e);
> +        }
> +        return;
> +      }
> +
> +      processHeaderLine(line);
> +    }
> +  }
> +
> +  /**
> +   * Reads one line from {@code in} into {@code line}, which is cleared
> +   * first. A line ends at CR, LF or CR LF; the terminator is consumed and
> +   * not stored. Every other byte is appended as a single char.
> +   *
> +   * When {@code allowContinuedLine} is set and the line read so far is not
> +   * empty, a space or tab directly after the terminator marks a
> +   * continuation: that one character is consumed and reading carries on
> +   * into the same buffer.
> +   *
> +   * @param in stream to read from
> +   * @param line buffer that receives the line
> +   * @param allowContinuedLine whether continuation lines are joined
> +   * @return the length of the line read
> +   * @throws EOFException if the stream ends before a line terminator
> +   * @throws IOException if reading fails
> +   */
> +  private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine)
> +      throws IOException {
> +    line.setLength(0);
> +    int c;
> +    while ((c = in.read()) != -1) {
> +      if (c != '\r' && c != '\n') {
> +        line.append((char) c);
> +        continue;
> +      }
> +
> +      // swallow the LF of a CR LF pair
> +      if (c == '\r' && peek(in) == '\n') {
> +        in.read();
> +      }
> +
> +      // at EOL -- check for continued line if the current
> +      // (possibly continued) line wasn't blank
> +      if (line.length() > 0 && allowContinuedLine) {
> +        int next = peek(in);
> +        if (next == ' ' || next == '\t') { // line is continued
> +          in.read();
> +          continue;
> +        }
> +      }
> +      return line.length(); // else complete
> +    }
> +    throw new EOFException();
> +  }
> +
> +  /**
> +   * Returns the next byte of {@code in} without consuming it.
> +   *
> +   * @param in stream to look ahead in
> +   * @return the next byte (0-255), or -1 at end of stream
> +   * @throws IOException if reading or pushing back fails
> +   */
> +  private static int peek(PushbackInputStream in) throws IOException {
> +    int value = in.read();
> +    // Only push back a real byte. unread(-1) would insert 0xFF into the
> +    // stream, so the next read would return 255 instead of reporting EOF.
> +    if (value != -1) {
> +      in.unread(value);
> +    }
> +    return value;
> +  }
> +}
> 
> Added: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html (added)
> +++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,5 @@
> +<html>
> +<body>
> +<p>Protocol plugin which supports retrieving documents via Selenium.</p><p></p>
> +</body>
> +</html>
> 
> Added: nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html
> URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html?rev=1662530&view=auto
> ==============================================================================
> --- nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html (added)
> +++ nutch/trunk/src/plugin/protocol-selenium/src/target/classes/org/apache/nutch/protocol/htmlunit/package.html Thu Feb 26 18:31:39 2015
> @@ -0,0 +1,5 @@
> +<html>
> +<body>
> +<p>Protocol plugin which supports retrieving documents via HtmlUnit.</p><p></p>
> +</body>
> +</html>
> 
>