You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/09/25 20:14:32 UTC
svn commit: r449765 -
/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java
Author: ab
Date: Mon Sep 25 11:14:31 2006
New Revision: 449765
URL: http://svn.apache.org/viewvc?view=rev&rev=449765
Log:
Catch exceptions on invalid URLs and continue collecting valid ones.
Modified:
lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java
Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=449765&r1=449764&r2=449765
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/parse/OutlinkExtractor.java Mon Sep 25 11:14:31 2006
@@ -16,6 +16,7 @@
package org.apache.nutch.parse;
+import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
@@ -108,7 +109,13 @@
}
result = matcher.getMatch();
url = result.group(0);
- outlinks.add(new Outlink(url, anchor, conf));
+ url = result.group(0);
+ try {
+ Outlink outlink = new Outlink(url, anchor, conf);
+ outlinks.add(new Outlink(url, anchor, conf));
+ } catch (MalformedURLException mue) {
+ LOG.warn("Invalid url: '" + url + "', skipping.");
+ }
}
} catch (Exception ex) {
// if the matcher fails (perhaps a malformed URL) we just log it and move on