You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/01/29 15:05:14 UTC

svn commit: r1562447 - in /nutch/branches/2.x: ./ src/plugin/lib-nekohtml/ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/

Author: lewismc
Date: Wed Jan 29 14:05:14 2014
New Revision: 1562447

URL: http://svn.apache.org/r1562447
Log:
NUTCH-1253 Incompatible neko and xerces versions

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml
    nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml
    nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Jan 29 14:05:14 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1253 Incompatible neko and xerces versions (snagel, lewismc)
+
 * NUTCH-1715 RobotRulesParser adds additional '*' to the robots name (tejasp)
 
 * NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, Doğacan Güney via markus)

Modified: nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml (original)
+++ nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml Wed Jan 29 14:05:14 2014
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="nekohtml" name="nekohtml" rev="0.9.5" conf="*->master"/>
+    <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/>
   </dependencies>
   
 </ivy-module>

Modified: nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml (original)
+++ nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml Wed Jan 29 14:05:14 2014
@@ -16,20 +16,21 @@
  limitations under the License.
 -->
 <!--
- ! NekoHTML is a simple HTML scanner and tag balancer.
- ! (http://people.apache.org/~andyc/neko/doc/html/index.html)
+ ! NekoHTML is a simple HTML scanner and tag balancer that enables 
+ ! application programmers to parse HTML documents and access the 
+ ! information using standard XML interfaces.
+ ! (http://sourceforge.net/projects/nekohtml/)
  ! 
- ! Dowload : http://people.apache.org/~andyc/neko/doc/html/index.html
- ! License : http://people.apache.org/~andyc/neko/LICENSE
+ ! License : https://nekohtml.svn.sourceforge.net/svnroot/nekohtml/trunk/LICENSE.txt
  !-->
 <plugin
    id="lib-nekohtml"
    name="CyberNeko HTML Parser"
-   version="1.9.11"
-   provider-name="org.cyberneko">
+   version="1.9.19"
+   provider-name="net.sourceforge.nekohtml">
 
    <runtime>
-     <library name="nekohtml-0.9.5.jar">
+     <library name="nekohtml-1.9.19.jar">
         <export name="*"/>
      </library>
    </runtime>

Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Jan 29 14:05:14 2014
@@ -254,6 +254,8 @@ public class HtmlParser implements Parse
   private DocumentFragment parseNeko(InputSource input) throws Exception {
     DOMFragmentParser parser = new DOMFragmentParser();
     try {
+      parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+              true);
       parser.setFeature("http://cyberneko.org/html/features/augmentations",
           true);
       parser.setProperty("http://cyberneko.org/html/properties/default-encoding",

Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Wed Jan 29 14:05:14 2014
@@ -223,6 +223,11 @@ public class TestDOMContentUtils {
     conf.setBoolean("parser.html.form.use_action", true);
     utils = new DOMContentUtils(conf);
     DOMFragmentParser parser= new DOMFragmentParser();
+    try {
+      parser.setFeature(
+          "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+          true);
+    } catch (SAXException e) {}
     for (int i= 0; i < testPages.length; i++) {
         DocumentFragment node= 
           new HTMLDocumentImpl().createDocumentFragment();