You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/06/11 22:39:05 UTC
svn commit: r190173 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/tools/SegmentMergeTool.java
plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
Author: ab
Date: Sat Jun 11 13:39:04 2005
New Revision: 190173
URL: http://svn.apache.org/viewcvs?rev=190173&view=rev
Log:
Assorted fixes:
* JSParseFilter: parse javascript in "href" attributes
* HTMLMetaProcessor: more correct workaround for broken meta refresh
* SegmentMergeTool: fix a bug in output segment handling.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java?rev=190173&r1=190172&r2=190173&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java Sat Jun 11 13:39:04 2005
@@ -567,7 +567,7 @@
LOG.severe("No input segments.");
return;
}
- if (output == null) output = (File)dirs.get(0);
+ if (output == null) output = ((File)dirs.get(0)).getParentFile();
SegmentMergeTool st = new SegmentMergeTool(nfs, (File[])dirs.toArray(new File[0]),
output, maxCount, runIndexer, delSegs);
st.run();
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=190173&r1=190172&r2=190173&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Sat Jun 11 13:39:04 2005
@@ -133,8 +133,12 @@
try {
refreshUrl = new URL(url);
} catch (Exception e) {
- // this has to be an absolute url!
- if (!url.startsWith("/")) url = "/" + url;
+ // XXX according to the spec, this has to be an absolute
+ // XXX url. However, many websites use relative URLs and
+ // XXX expect browsers to handle that.
+ // XXX Unfortunately, in some cases this may create
+ // XXX infinitely recursive paths (a crawler trap)...
+ // if (!url.startsWith("/")) url = "/" + url;
try {
refreshUrl = new URL(currURL, url);
} catch (Exception e1) {
Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=190173&r1=190172&r2=190173&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Jun 11 13:39:04 2005
@@ -99,10 +99,16 @@
// Keyboard: onkeydown,onkeypress,onkeyup
// Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
Node anode = attrs.item(i);
+ Outlink[] links = null;
if (anode.getNodeName().startsWith("on")) {
- Outlink[] links = getJSLinks(anode.getNodeValue(), base, base);
- if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+ links = getJSLinks(anode.getNodeValue(), base, base);
+ } else if (anode.getNodeName().equalsIgnoreCase("href")) {
+ String val = anode.getNodeValue();
+ if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
+ links = getJSLinks(val, base, base);
+ }
}
+ if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
}
}
}