You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/01/05 17:58:30 UTC
svn commit: r493085 - in /lucene/nutch/trunk: CHANGES.txt
src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
Author: ab
Date: Fri Jan 5 08:58:29 2007
New Revision: 493085
URL: http://svn.apache.org/viewvc?view=rev&rev=493085
Log:
Fix NUTCH-425 and NUTCH-426.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=493085&r1=493084&r2=493085
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Jan 5 08:58:29 2007
@@ -114,6 +114,9 @@
36. Fix Injector to preserve already existing CrawlDatum if the seed list
being injected also contains such URL. (ab)
+37. NUTCH-425, NUTCH-426 - Fix anchors pollution. Continue after
+ skipping bad URLs. (Michael Stack via ab)
+
Release 0.8 - 2006-07-25
Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=493085&r1=493084&r2=493085
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Fri Jan 5 08:58:29 2007
@@ -20,6 +20,7 @@
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
@@ -106,7 +107,7 @@
// if (LOG.isInfoEnabled()) {
// LOG.info("script: language=" + lang + ", text: " + script.toString());
// }
- Outlink[] links = getJSLinks(script.toString(), base, base);
+ Outlink[] links = getJSLinks(script.toString(), "", base);
if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
// no other children of interest here, go one level up.
return;
@@ -123,11 +124,11 @@
Node anode = attrs.item(i);
Outlink[] links = null;
if (anode.getNodeName().startsWith("on")) {
- links = getJSLinks(anode.getNodeValue(), base, base);
+ links = getJSLinks(anode.getNodeValue(), "", base);
} else if (anode.getNodeName().equalsIgnoreCase("href")) {
String val = anode.getNodeValue();
if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
- links = getJSLinks(val, base, base);
+ links = getJSLinks(val, "", base);
}
}
if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
@@ -146,7 +147,7 @@
return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
"Content not JavaScript: '" + type + "'").getEmptyParse(getConf());
String script = new String(c.getContent());
- Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl());
+ Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
if (outlinks == null) outlinks = new Outlink[0];
// Title? use the first line of the script...
String title;
@@ -212,7 +213,19 @@
}
if (url.startsWith("www.")) {
url = "http://" + url;
- } else url = new URL(baseURL, url).toString();
+ } else {
+ // See if candidate URL is parseable. If not, pass and move on to
+ // the next match.
+ try {
+ url = new URL(baseURL, url).toString();
+ } catch (MalformedURLException ex) {
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(" - failed URL parse '" + url + "' and baseURL '" +
+ baseURL + "'", ex);
+ }
+ continue;
+ }
+ }
url = url.replaceAll("&amp;", "&");
if (LOG.isTraceEnabled()) {
LOG.trace(" - outlink from JS: '" + url + "'");
@@ -249,7 +262,7 @@
while ((line = br.readLine()) != null) sb.append(line + "\n");
JSParseFilter parseFilter = new JSParseFilter();
parseFilter.setConf(NutchConfiguration.create());
- Outlink[] links = parseFilter.getJSLinks(sb.toString(), args[1], args[1]);
+ Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
System.out.println("Outlinks extracted: " + links.length);
for (int i = 0; i < links.length; i++)
System.out.println(" - " + links[i]);