You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/01/29 15:05:14 UTC
svn commit: r1562447 - in /nutch/branches/2.x: ./ src/plugin/lib-nekohtml/
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/
Author: lewismc
Date: Wed Jan 29 14:05:14 2014
New Revision: 1562447
URL: http://svn.apache.org/r1562447
Log:
NUTCH-1253 Incompatible neko and xerces versions
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml
nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Jan 29 14:05:14 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1253 Incompatable neko and xerces versions (snagel, lewismc)
+
* NUTCH-1715 RobotRulesParser adds additional '*' to the robots name (tejasp)
* NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, Doğacan Güney via markus)
Modified: nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml (original)
+++ nutch/branches/2.x/src/plugin/lib-nekohtml/ivy.xml Wed Jan 29 14:05:14 2014
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="nekohtml" name="nekohtml" rev="0.9.5" conf="*->master"/>
+ <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.19" conf="*->master"/>
</dependencies>
</ivy-module>
Modified: nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml (original)
+++ nutch/branches/2.x/src/plugin/lib-nekohtml/plugin.xml Wed Jan 29 14:05:14 2014
@@ -16,20 +16,21 @@
limitations under the License.
-->
<!--
- ! NekoHTML is a simple HTML scanner and tag balancer.
- ! (http://people.apache.org/~andyc/neko/doc/html/index.html)
+ ! NekoHTML is a simple HTML scanner and tag balancer that enables
+ ! application programmers to parse HTML documents and access the
+ ! information using standard XML interfaces.
+ ! (http://sourceforge.net/projects/nekohtml/)
!
- ! Dowload : http://people.apache.org/~andyc/neko/doc/html/index.html
- ! License : http://people.apache.org/~andyc/neko/LICENSE
+ ! License : https://nekohtml.svn.sourceforge.net/svnroot/nekohtml/trunk/LICENSE.txt
!-->
<plugin
id="lib-nekohtml"
name="CyberNeko HTML Parser"
- version="1.9.11"
- provider-name="org.cyberneko">
+ version="1.9.19"
+ provider-name="net.sourceforge.nekohtml">
<runtime>
- <library name="nekohtml-0.9.5.jar">
+ <library name="nekohtml-1.9.19.jar">
<export name="*"/>
</library>
</runtime>
Modified: nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Jan 29 14:05:14 2014
@@ -254,6 +254,8 @@ public class HtmlParser implements Parse
private DocumentFragment parseNeko(InputSource input) throws Exception {
DOMFragmentParser parser = new DOMFragmentParser();
try {
+ parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
parser.setFeature("http://cyberneko.org/html/features/augmentations",
true);
parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1562447&r1=1562446&r2=1562447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Wed Jan 29 14:05:14 2014
@@ -223,6 +223,11 @@ public class TestDOMContentUtils {
conf.setBoolean("parser.html.form.use_action", true);
utils = new DOMContentUtils(conf);
DOMFragmentParser parser= new DOMFragmentParser();
+ try {
+ parser.setFeature(
+ "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
+ } catch (SAXException e) {}
for (int i= 0; i < testPages.length; i++) {
DocumentFragment node=
new HTMLDocumentImpl().createDocumentFragment();