You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2011/09/22 16:02:51 UTC
svn commit: r1174147 - in /nutch/branches/branch-1.4: conf/nutch-default.xml
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
Author: markus
Date: Thu Sep 22 14:02:51 2011
New Revision: 1174147
URL: http://svn.apache.org/viewvc?rev=1174147&view=rev
Log:
NUTCH-1115 Option to disable fixing of URL embedded parameters in DOMContentUtils
Modified:
nutch/branches/branch-1.4/conf/nutch-default.xml
nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1174147&r1=1174146&r2=1174147&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Thu Sep 22 14:02:51 2011
@@ -921,7 +921,6 @@
"all" doesn't show either content or summaries.</description>
</property>
-
<property>
<name>parser.html.impl</name>
<value>neko</value>
@@ -950,6 +949,13 @@
</property>
<property>
+ <name>parser.fix.embeddedparams</name>
+ <value>true</value>
+ <description>Whether to fix URL-embedded parameters (those delimited by semicolons).
+ See NUTCH-436 and NUTCH-1115</description>
+</property>
+
+<property>
<name>htmlparsefilter.order</name>
<value></value>
<description>The order by which HTMLParse filters are applied.
Modified: nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1174147&r1=1174146&r2=1174147&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Thu Sep 22 14:02:51 2011
@@ -39,6 +39,8 @@ import org.w3c.dom.*;
*/
public class DOMContentUtils {
+ private boolean fixEmbeddedParams;
+
public static class LinkParams {
public String elName;
public String attrName;
@@ -87,6 +89,9 @@ public class DOMContentUtils {
if ( ! forceTags.contains(ignoreTags[i]) )
linkParams.remove(ignoreTags[i]);
}
+
+ // https://issues.apache.org/jira/browse/NUTCH-1115
+ fixEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true);
}
/**
@@ -321,7 +326,7 @@ public class DOMContentUtils {
// the target contains params information or the base doesn't then no
// conversion necessary, return regular URL
- if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+ if (!fixEmbeddedParams || target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
return new URL(base, target);
}
Modified: nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1174147&r1=1174146&r2=1174147&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original)
+++ nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Thu Sep 22 14:02:51 2011
@@ -39,6 +39,8 @@ import org.w3c.dom.NodeList;
*/
class DOMContentUtils {
+ private boolean fixEmbeddedParams;
+
private static class LinkParams {
private String elName;
private String attrName;
@@ -87,6 +89,9 @@ class DOMContentUtils {
if ( ! forceTags.contains(ignoreTags[i]) )
linkParams.remove(ignoreTags[i]);
}
+
+ // https://issues.apache.org/jira/browse/NUTCH-1115
+ fixEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true);
}
/**
@@ -318,10 +323,10 @@ class DOMContentUtils {
*/
private URL fixEmbeddedParams(URL base, String target)
throws MalformedURLException{
-
+
// the target contains params information or the base doesn't then no
// conversion necessary, return regular URL
- if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+ if (!fixEmbeddedParams || target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
return new URL(base, target);
}
@@ -340,7 +345,7 @@ class DOMContentUtils {
else {
target += params;
}
-
+
return new URL(base, target);
}