You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/06/11 22:39:05 UTC

svn commit: r190173 - in /lucene/nutch/trunk/src: java/org/apache/nutch/tools/SegmentMergeTool.java plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

Author: ab
Date: Sat Jun 11 13:39:04 2005
New Revision: 190173

URL: http://svn.apache.org/viewcvs?rev=190173&view=rev
Log:
Assorted fixes:


* JSParseFilter: parse javascript in "href" attributes

* HTMLMetaProcessor: more correct workaround for broken meta refresh

* SegmentMergeTool: fix a bug in output segment handling.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
    lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java?rev=190173&r1=190172&r2=190173&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java Sat Jun 11 13:39:04 2005
@@ -567,7 +567,7 @@
       LOG.severe("No input segments.");
       return;
     }
-    if (output == null) output = (File)dirs.get(0);
+    if (output == null) output = ((File)dirs.get(0)).getParentFile();
     SegmentMergeTool st = new SegmentMergeTool(nfs, (File[])dirs.toArray(new File[0]),
             output, maxCount, runIndexer, delSegs);
     st.run();

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=190173&r1=190172&r2=190173&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Sat Jun 11 13:39:04 2005
@@ -133,8 +133,12 @@
                   try {
                     refreshUrl = new URL(url);
                   } catch (Exception e) {
-                    // this has to be an absolute url!
-                    if (!url.startsWith("/")) url = "/" + url;
+                    // XXX according to the spec, this has to be an absolute
+                    // XXX url. However, many websites use relative URLs and
+                    // XXX expect browsers to handle that.
+                    // XXX Unfortunately, in some cases this may create
+                    // XXX infinitely recursive paths (a crawler trap)...
+                    // if (!url.startsWith("/")) url = "/" + url;
                     try {
                       refreshUrl = new URL(currURL, url);
                     } catch (Exception e1) {

Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=190173&r1=190172&r2=190173&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Jun 11 13:39:04 2005
@@ -99,10 +99,16 @@
           // Keyboard: onkeydown,onkeypress,onkeyup
           // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
           Node anode = attrs.item(i);
+          Outlink[] links = null;
           if (anode.getNodeName().startsWith("on")) {
-            Outlink[] links = getJSLinks(anode.getNodeValue(), base, base);
-            if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+            links = getJSLinks(anode.getNodeValue(), base, base);
+          } else if (anode.getNodeName().equalsIgnoreCase("href")) {
+            String val = anode.getNodeValue();
+            if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
+              links = getJSLinks(val, base, base);
+            }
           }
+          if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
         }
       }
     }