You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/06/09 22:57:49 UTC

svn commit: r189818 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java

Author: cutting
Date: Thu Jun  9 13:57:49 2005
New Revision: 189818

URL: http://svn.apache.org/viewcvs?rev=189818&view=rev
Log:
Implement anchor truncation, as in trunk.

Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java?rev=189818&r1=189817&r2=189818&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java Thu Jun  9 13:57:49 2005
@@ -35,6 +35,7 @@
 
   public static String CURRENT_NAME = "current";
 
+  private int maxAnchorLength;
 
   public LinkDb() {
     super(null);
@@ -45,7 +46,9 @@
     super(conf);
   }
 
-  public void configure(JobConf job) {}
+  public void configure(JobConf job) {
+    maxAnchorLength = job.getInt("db.max.anchor.length", 100);
+  }
 
   public void map(WritableComparable key, Writable value,
                   OutputCollector output) throws IOException {
@@ -56,7 +59,11 @@
     for (int i = 0; i < outlinks.length; i++) {
       Outlink outlink = outlinks[i];
       inlinks.clear();
-      inlinks.add(new Inlink(fromUrl, outlink.getAnchor()));
+      String anchor = outlink.getAnchor();        // truncate long anchors
+      if (anchor.length() > maxAnchorLength) {
+        anchor = anchor.substring(0, maxAnchorLength);
+      }
+      inlinks.add(new Inlink(fromUrl, anchor));   // collect inverted link
       output.collect(new UTF8(outlink.getToUrl()), inlinks);
     }
   }