You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/06/07 23:50:52 UTC

svn commit: r189452 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java

Author: cutting
Date: Tue Jun  7 14:50:51 2005
New Revision: 189452

URL: http://svn.apache.org/viewcvs?rev=189452&view=rev
Log:
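Normalize and filter outlink URLs before appending them to the crawl output; a link is skipped when normalization or filtering throws an exception or yields null.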

Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java?rev=189452&r1=189451&r2=189452&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java Tue Jun  7 14:50:51 2005
@@ -22,6 +22,7 @@
 import org.apache.nutch.util.*;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.net.*;
 
 import java.io.*;
 import java.util.*;
@@ -36,6 +37,8 @@
 
   private float interval;
 
+  private UrlNormalizer urlNormalizer = UrlNormalizerFactory.getNormalizer();
+        
   public ParseSegment() { super(null); }
 
   public ParseSegment(NutchConf conf) {
@@ -93,7 +96,7 @@
                               UTF8.class, CrawlDatum.class);
     
     return new RecordWriter() {
-        
+
         public void write(WritableComparable key, Writable value)
           throws IOException {
           
@@ -105,9 +108,17 @@
           // collect outlinks for subsequent db update
           Outlink[] links = parse.getData().getOutlinks();
           for (int i = 0; i < links.length; i++) {
-            crawlOut.append(new UTF8(links[i].getToUrl()),
-                            new CrawlDatum(CrawlDatum.STATUS_LINKED,
-                                           interval));
+            String toUrl = links[i].getToUrl();
+            try {
+              toUrl = urlNormalizer.normalize(toUrl); // normalize the url
+              toUrl = URLFilters.filter(toUrl);   // filter the url
+            } catch (Exception e) {
+              toUrl = null;
+            }
+            if (toUrl != null)
+              crawlOut.append(new UTF8(toUrl),
+                              new CrawlDatum(CrawlDatum.STATUS_LINKED,
+                                             interval));
           }
         }