You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2007/03/10 18:40:23 UTC
svn commit: r516757 - in /lucene/nutch/trunk/src/plugin/parse-html/src:
java/org/apache/nutch/parse/html/DOMContentUtils.java
test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Author: kubes
Date: Sat Mar 10 09:40:20 2007
New Revision: 516757
URL: http://svn.apache.org/viewvc?view=rev&rev=516757
Log:
NUTCH-436 resolved. Fixed the handling of URLs that carry param
(i.e. ';xxxx') information. Finally found a workaround for the
problems that I was experiencing with EOL characters.
Modified:
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?view=diff&rev=516757&r1=516756&r2=516757
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Sat Mar 10 09:40:20 2007
@@ -282,6 +282,51 @@
return false;
}
+
+ /**
+ * Resolves a link target against a base URL whose param information
+ * (the ';xxxx' part) is embedded in the base rather than in the target.
+ * <p>
+ * If the target contains params information (i.e. any ';' character) then
+ * the target params information is assumed to be correct and any base params
+ * information is ignored. If the base contains params information but the
+ * target does not, then the params information is copied onto the target
+ * before resolution, allowing the result to be correctly determined by the
+ * java.net.URL class.
+ * <p>
+ * When the params are copied they are inserted in front of the first '?' of
+ * the target, or appended to the end of the target when it has no '?'. For
+ * example, with the base <code>http://www.nutch.org/;something</code> the
+ * target <code>g?y#s</code> is rewritten to <code>g;something?y#s</code>
+ * and then resolved against the base.
+ * <p>
+ * Behaviour worth knowing when changing this method:
+ * <ul>
+ * <li>The params are taken as everything from the first ';' of
+ * <code>base.toString()</code> to the end of that string, so a query string
+ * or fragment that follows the ';' in the base is carried over as well.</li>
+ * <li>A target that has a fragment but no query string (for instance
+ * <code>g#s</code>) gets the params appended after the fragment.</li>
+ * <li>No check is made for the target being an absolute URL; it receives
+ * the params in the same way as a relative one.</li>
+ * </ul>
+ * NOTE(review): the three points above look unintended -- confirm and
+ * consider restricting the copied text to the path params only.
+ *
+ * @param base The base URL.
+ * @param target The target path from the base URL.
+ *
+ * @return URL The target resolved against the base, with the base params
+ * information copied onto it when the target had none of its own.
+ *
+ * @throws MalformedURLException If the URL constructor rejects the
+ * (possibly rewritten) target.
+ */
+ private URL fixEmbeddedParams(URL base, String target)
+ throws MalformedURLException{
+
+ // if the target already contains params information, or the base has none,
+ // no conversion is necessary: resolve the target as a regular URL
+ if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) {
+ return new URL(base, target);
+ }
+
+ // get the base url and its params information (first ';' to end of string)
+ String baseURL = base.toString();
+ int startParams = baseURL.indexOf(';');
+ String params = baseURL.substring(startParams);
+
+ // if the target has a query string then put the params information after
+ // any path but before the query string, otherwise just append to the target
+ int startQS = target.indexOf('?');
+ if (startQS >= 0) {
+ target = target.substring(0, startQS) + params +
+ target.substring(startQS);
+ }
+ else {
+ target += params;
+ }
+
+ return new URL(base, target);
+ }
/**
* This method finds all anchors below the supplied DOM
@@ -333,7 +378,9 @@
}
if (target != null && !noFollow && !post)
try {
- URL url = new URL(base, target);
+
+ URL url = (base.toString().indexOf(';') > 0) ?
+ fixEmbeddedParams(base, target) : new URL(base, target);
outlinks.add(new Outlink(url.toString(),
linkText.toString().trim(), conf));
} catch (MalformedURLException e) {
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?view=diff&rev=516757&r1=516756&r2=516757
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Sat Mar 10 09:40:20 2007
@@ -134,6 +134,20 @@
+ "<input type=submit><p>test1</p></form>"
+ "<form method='GET' action='/dummy.jsp'><input type=text>"
+ "<input type=submit><p>test2</p></form></body></html>"),
+ new String("<html><head><title> title </title>"
+ + "</head><body>"
+ + "<a href=\";x\">anchor1</a>"
+ + "<a href=\"g;x\">anchor2</a>"
+ + "<a href=\"g;x?y#s\">anchor3</a>"
+ + "</body></html>"),
+ new String("<html><head><title> title </title>"
+ + "</head><body>"
+ + "<a href=\"g\">anchor1</a>"
+ + "<a href=\"g?y#s\">anchor2</a>"
+ + "<a href=\"?y=1\">anchor3</a>"
+ + "<a href=\"?y=1#s\">anchor4</a>"
+ + "<a href=\"?y=1;somethingelse\">anchor5</a>"
+ + "</body></html>"),
};
private static int SKIP = 9;
@@ -149,6 +163,8 @@
"http://www.nutch.org//",
"http://www.nutch.org/",
"http://www.nutch.org/",
+ "http://www.nutch.org/",
+ "http://www.nutch.org/;something"
};
private static final DocumentFragment testDOMs[]=
@@ -173,7 +189,9 @@
+ "End this madness ! . . . .",
"ignore ignore",
"test1 test2",
- "test1 test2"
+ "test1 test2",
+ "title anchor1 anchor2 anchor3",
+ "title anchor1 anchor2 anchor3 anchor4 anchor5"
};
private static final String[] answerTitle= {
@@ -186,7 +204,9 @@
"my title",
"",
"",
- ""
+ "",
+ "title",
+ "title"
};
// note: should be in page-order
@@ -258,6 +278,18 @@
new Outlink("http://www.nutch.org/dummy.jsp", "test2", conf),
},
{
+ },
+ {
+ new Outlink("http://www.nutch.org/;x", "anchor1", conf),
+ new Outlink("http://www.nutch.org/g;x", "anchor2", conf),
+ new Outlink("http://www.nutch.org/g;x?y#s", "anchor3", conf)
+ },
+ {
+ new Outlink("http://www.nutch.org/g;something", "anchor1", conf),
+ new Outlink("http://www.nutch.org/g;something?y#s", "anchor2", conf),
+ new Outlink("http://www.nutch.org/;something?y=1", "anchor3", conf),
+ new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4", conf),
+ new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5", conf)
}
};