You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/01/05 17:58:30 UTC

svn commit: r493085 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

Author: ab
Date: Fri Jan  5 08:58:29 2007
New Revision: 493085

URL: http://svn.apache.org/viewvc?view=rev&rev=493085
Log:
Fix NUTCH-425 and NUTCH-426.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=493085&r1=493084&r2=493085
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Jan  5 08:58:29 2007
@@ -114,6 +114,9 @@
 36. Fix Injector to preserve already existing CrawlDatum if the seed list
     being injected also contains such URL. (ab)
 
+37. NUTCH-425, NUTCH-426 - Fix anchors pollution. Continue after
+    skipping bad URLs. (Michael Stack via ab)
+
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=493085&r1=493084&r2=493085
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Fri Jan  5 08:58:29 2007
@@ -20,6 +20,7 @@
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -106,7 +107,7 @@
           // if (LOG.isInfoEnabled()) {
           //   LOG.info("script: language=" + lang + ", text: " + script.toString());
           // }
-          Outlink[] links = getJSLinks(script.toString(), base, base);
+          Outlink[] links = getJSLinks(script.toString(), "", base);
           if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
           // no other children of interest here, go one level up.
           return;
@@ -123,11 +124,11 @@
           Node anode = attrs.item(i);
           Outlink[] links = null;
           if (anode.getNodeName().startsWith("on")) {
-            links = getJSLinks(anode.getNodeValue(), base, base);
+            links = getJSLinks(anode.getNodeValue(), "", base);
           } else if (anode.getNodeName().equalsIgnoreCase("href")) {
             String val = anode.getNodeValue();
             if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
-              links = getJSLinks(val, base, base);
+              links = getJSLinks(val, "", base);
             }
           }
           if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
@@ -146,7 +147,7 @@
       return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
               "Content not JavaScript: '" + type + "'").getEmptyParse(getConf());
     String script = new String(c.getContent());
-    Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl());
+    Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
     if (outlinks == null) outlinks = new Outlink[0];
     // Title? use the first line of the script...
     String title;
@@ -212,7 +213,19 @@
         }
         if (url.startsWith("www.")) {
             url = "http://" + url;
-        } else url = new URL(baseURL, url).toString();
+        } else {
+          // See if candidate URL is parseable.  If not, pass and move on to
+          // the next match.
+          try {
+            url = new URL(baseURL, url).toString();
+          } catch (MalformedURLException ex) {
+            if (LOG.isTraceEnabled()) {
+              LOG.trace(" - failed URL parse '" + url + "' and baseURL '" +
+                  baseURL + "'", ex);
+            }
+            continue;
+          }
+        }
         url = url.replaceAll("&amp;", "&");
         if (LOG.isTraceEnabled()) {
           LOG.trace(" - outlink from JS: '" + url + "'");
@@ -249,7 +262,7 @@
     while ((line = br.readLine()) != null) sb.append(line + "\n");
     JSParseFilter parseFilter = new JSParseFilter();
     parseFilter.setConf(NutchConfiguration.create());
-    Outlink[] links = parseFilter.getJSLinks(sb.toString(), args[1], args[1]);
+    Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
     System.out.println("Outlinks extracted: " + links.length);
     for (int i = 0; i < links.length; i++)
       System.out.println(" - " + links[i]);