You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/08/31 17:57:18 UTC

svn commit: r1379488 - in /nutch/branches/2.x: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/scoring/

Author: ferdy
Date: Fri Aug 31 15:57:18 2012
New Revision: 1379488

URL: http://svn.apache.org/viewvc?rev=1379488&view=rev
Log:
NUTCH-1431 Introduce link 'distance' and add configurable max distance in the generator

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoreDatum.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1379488&r1=1379487&r2=1379488&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Aug 31 15:57:18 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.1 - Current Development
 
+* NUTCH-1431 Introduce link 'distance' and add configurable max distance in the generator (ferdy)
+
 * NUTCH-1448 Redirected urls should be handled more cleanly (more like an outlink url) (ferdy)
 
 * NUTCH-1463 Elasticsearch indexer should wait and check response for last flush (ferdy)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1379488&r1=1379487&r2=1379488&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Fri Aug 31 15:57:18 2012
@@ -543,6 +543,15 @@
 </property>
 
 <property>
+  <name>generate.max.distance</name>
+  <value>-1</value>
+  <description>The maximum distance of a URL that the generator is allowed
+  to select for fetch; -1 disables the limit. The distance is the length of the
+  shortest link path from an injected URL (injected URLs have distance 0).
+  </description>
+</property>
+
+<property>
   <name>generate.count.mode</name>
   <value>host</value>
   <description>Determines how the URLs are counted for generator.max.count.

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1379488&r1=1379487&r2=1379488&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Fri Aug 31 15:57:18 2012
@@ -57,7 +57,11 @@ extends GoraMapper<String, WebPage, UrlW
     Map<Utf8, Utf8> outlinks = page.getOutlinks();
     if (outlinks != null) {
       for (Entry<Utf8, Utf8> e : outlinks.entrySet()) {
-        scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue().toString()));
+                int depth=Integer.MAX_VALUE;
+        Utf8 depthUtf8=page.getFromMarkers(DbUpdaterJob.DISTANCE);
+        if (depthUtf8 != null) depth=Integer.parseInt(depthUtf8.toString());
+        scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), 
+            e.getValue().toString(), depth));
       }
     }
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1379488&r1=1379487&r2=1379488&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Fri Aug 31 15:57:18 2012
@@ -154,6 +154,29 @@ extends GoraReducer<UrlWithScore, NutchW
       page.putToInlinks(new Utf8(inlink.getUrl()), new Utf8(inlink.getAnchor()));
     }
 
+    // Distance calculation.
+    // Retrieve the smallest distance among all inlink distances.
+    // Calculate the new distance for the current page: smallest inlink distance plus 1.
+    // If the new distance is smaller than the old one (or if no old one exists yet),
+    // write it to the page.
+    int smallestDist=Integer.MAX_VALUE;
+    for (ScoreDatum inlink : inlinkedScoreData) {
+      int inlinkDist = inlink.getDistance();
+      if (inlinkDist < smallestDist) {
+        smallestDist=inlinkDist;
+      }
+      page.putToInlinks(new Utf8(inlink.getUrl()), new Utf8(inlink.getAnchor()));
+    }
+    if (smallestDist != Integer.MAX_VALUE) {
+      int oldDistance=Integer.MAX_VALUE;
+      Utf8 oldDistUtf8 = page.getFromMarkers(DbUpdaterJob.DISTANCE);
+      if (oldDistUtf8 != null)oldDistance=Integer.parseInt(oldDistUtf8.toString());
+      int newDistance = smallestDist+1;
+      if (newDistance < oldDistance) {
+        page.putToMarkers(DbUpdaterJob.DISTANCE, new Utf8(Integer.toString(newDistance)));
+      }
+    }
+
     try {
       scoringFilters.updateScore(url, page, inlinkedScoreData);
     } catch (ScoringFilterException e) {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1379488&r1=1379487&r2=1379488&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Fri Aug 31 15:57:18 2012
@@ -20,6 +20,7 @@ import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map;
 
+import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
@@ -60,6 +61,8 @@ public class DbUpdaterJob extends NutchT
     FIELDS.add(WebPage.Field.PREV_FETCH_TIME);
   }
 
+  public static final Utf8 DISTANCE = new Utf8("dist");
+
   public DbUpdaterJob() {
 
   }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java?rev=1379488&r1=1379487&r2=1379488&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java Fri Aug 31 15:57:18 2012
@@ -29,6 +29,7 @@ import org.apache.nutch.scoring.ScoringF
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.TableUtil;
+import org.apache.avro.util.Utf8;
 import org.apache.gora.mapreduce.GoraMapper;
 
 public class GeneratorMapper
@@ -42,6 +43,7 @@ extends GoraMapper<String, WebPage, Sele
   private ScoringFilters scoringFilters;
   private long curTime;
   private SelectorEntry entry = new SelectorEntry();
+  private int maxDistance;
 
   @Override
   public void map(String reversedUrl, WebPage page,
@@ -55,6 +57,17 @@ extends GoraMapper<String, WebPage, Sele
       return;
     }
 
+    // Filter on distance: skip pages farther than maxDistance (no filtering when maxDistance < 0)
+    if (maxDistance > -1) {
+      Utf8 distanceUtf8 = page.getFromMarkers(DbUpdaterJob.DISTANCE);
+      if (distanceUtf8 != null) {
+        int distance=Integer.parseInt(distanceUtf8.toString());
+        if (distance > maxDistance) {
+          return;
+        }
+      }
+    }
+
     // If filtering is on don't generate URLs that don't pass URLFilters
     try {
       if (normalise) {
@@ -103,6 +116,7 @@ extends GoraMapper<String, WebPage, Sele
     if (normalise) {
       normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
     }
+    maxDistance=conf.getInt("generate.max.distance", -1);
     curTime = conf.getLong(GeneratorJob.GENERATOR_CUR_TIME, System.currentTimeMillis());
     schedule = FetchScheduleFactory.getFetchSchedule(conf);
     scoringFilters = new ScoringFilters(conf);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java?rev=1379488&r1=1379487&r2=1379488&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java Fri Aug 31 15:57:18 2012
@@ -179,6 +179,8 @@ public class InjectorJob extends NutchTo
     				  + ", using default (" + e.getMessage() + ")");
     	  }
       }
+      
+      row.putToMarkers(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
 
       Mark.INJECT_MARK.putMark(row, YES_STRING);
       context.write(reversedUrl, row);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoreDatum.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoreDatum.java?rev=1379488&r1=1379487&r2=1379488&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoreDatum.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoreDatum.java Fri Aug 31 15:57:18 2012
@@ -33,14 +33,16 @@ public class ScoreDatum implements Writa
   private float score;
   private String url;
   private String anchor;
+  private int distance;
   private Map<String, byte[]> metaData = new HashMap<String, byte[]>();
   
   public ScoreDatum() { }
   
-  public ScoreDatum(float score, String url, String anchor) {
+  public ScoreDatum(float score, String url, String anchor, int depth) {
     this.score = score;
     this.url = url;
     this.anchor = anchor;
+    this.distance = depth;
   }
 
   @Override
@@ -48,6 +50,7 @@ public class ScoreDatum implements Writa
     score = in.readFloat();
     url = Text.readString(in);
     anchor = Text.readString(in);
+    distance = WritableUtils.readVInt(in);
     metaData.clear();
     
     int size = WritableUtils.readVInt(in);
@@ -55,7 +58,7 @@ public class ScoreDatum implements Writa
       String key = Text.readString(in);
       byte[] value = Bytes.readByteArray(in);
       metaData.put(key, value);
-    }
+    }    
   }
 
   @Override
@@ -63,6 +66,7 @@ public class ScoreDatum implements Writa
     out.writeFloat(score);
     Text.writeString(out, url);
     Text.writeString(out, anchor);
+    WritableUtils.writeVInt(out, distance);
     
     WritableUtils.writeVInt(out, metaData.size());
     for (Entry<String, byte[]> e : metaData.entrySet()) {
@@ -102,11 +106,15 @@ public class ScoreDatum implements Writa
   public String getAnchor() {
     return anchor;
   }
+  
+  public int getDistance() {
+    return distance;
+  }
 
   @Override
   public String toString() {
     return "ScoreDatum [score=" + score + ", url=" + url + ", anchor=" + anchor
-        + ", metaData=" + metaData + "]";
+        + ", distance="+distance + ", metaData=" + metaData + "]";
   }