You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/10/20 22:04:21 UTC

svn commit: r326994 - in /lucene/nutch/branches/mapred: conf/ src/java/org/apache/nutch/crawl/

Author: cutting
Date: Thu Oct 20 13:04:17 2005
New Revision: 326994

URL: http://svn.apache.org/viewcvs?rev=326994&view=rev
Log:
Use the OPIC score when crawling and as a boost in search, replacing the use of a simple incoming-link count.

Modified:
    lucene/nutch/branches/mapred/conf/crawl-tool.xml
    lucene/nutch/branches/mapred/conf/nutch-default.xml
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java

Modified: lucene/nutch/branches/mapred/conf/crawl-tool.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-tool.xml?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/crawl-tool.xml (original)
+++ lucene/nutch/branches/mapred/conf/crawl-tool.xml Thu Oct 20 13:04:17 2005
@@ -15,13 +15,6 @@
 </property>
 
 <property>
-  <name>indexer.boost.by.link.count</name>
-  <value>true</value>
-  <description>When true scores for a page are multipled by the log of
-  the number of incoming links to the page.</description>
-</property>
-
-<property>
   <name>db.ignore.internal.links</name>
   <value>false</value>
   <description>If true, when adding new links to a page, links from

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Thu Oct 20 13:04:17 2005
@@ -271,17 +271,6 @@
   fetchlist.  -1 if unlimited.</description>
 </property>
 
-<!-- fetchlist tool properties -->
-
-<property>
-  <name>fetchlist.score.by.link.count</name>
-  <value>true</value>
-  <description>If true, set page scores on fetchlist entries based on
-  log(number of anchors), instead of using original page scores. This
-  results in prioritization of pages with many incoming links.
-  </description>
-</property>
-
 <!-- fetcher properties -->
 
 <property>
@@ -448,13 +437,6 @@
   value of this parameter.  This is compiled into indexes, so, when
   this is changed, pages must be re-indexed for it to take
   effect.</description>
-</property>
-
-<property>
-  <name>indexer.boost.by.link.count</name>
-  <value>true</value>
-  <description>When true scores for a page are multipled by the log of
-  the number of incoming links to the page.</description>
 </property>
 
 <property>

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Thu Oct 20 13:04:17 2005
@@ -114,7 +114,7 @@
     new LinkDb(conf).invert(linkDb, segments); // invert links
 
     // index, dedup & merge
-    new Indexer(conf).index(indexes, linkDb, fs.listFiles(segments));
+    new Indexer(conf).index(indexes, crawlDb, linkDb, fs.listFiles(segments));
     new DeleteDuplicates(conf).dedup(new File[] { indexes });
     new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge();
 

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java Thu Oct 20 13:04:17 2005
@@ -31,7 +31,7 @@
   public static final String FETCH_DIR_NAME = "crawl_fetch";
   public static final String PARSE_DIR_NAME = "crawl_parse";
 
-  private final static byte CUR_VERSION = 1;
+  private final static byte CUR_VERSION = 2;
 
   public static final byte STATUS_DB_UNFETCHED = 1;
   public static final byte STATUS_DB_FETCHED = 2;
@@ -47,15 +47,18 @@
   private long fetchTime = System.currentTimeMillis();
   private byte retries;
   private float fetchInterval;
-  private int linkCount;
+  private float score = 1.0f;
 
   public CrawlDatum() {}
 
   public CrawlDatum(int status, float fetchInterval) {
     this.status = (byte)status;
     this.fetchInterval = fetchInterval;
-    if (status == STATUS_LINKED)
-      linkCount = 1;
+  }
+
+  public CrawlDatum(int status, float fetchInterval, float score) {
+    this(status, fetchInterval);
+    this.score = score;
   }
 
   //
@@ -80,8 +83,8 @@
     this.fetchInterval = fetchInterval;
   }
 
-  public int getLinkCount() { return linkCount; }
-  public void setLinkCount(int linkCount) { this.linkCount = linkCount; }
+  public float getScore() { return score; }
+  public void setScore(float score) { this.score = score; }
 
   //
   // writable methods
@@ -96,18 +99,18 @@
 
   public void readFields(DataInput in) throws IOException {
     byte version = in.readByte();                 // read version
-    if (version > CUR_VERSION)                    // check version
+    if (version != CUR_VERSION)                   // check version
       throw new VersionMismatchException(CUR_VERSION, version);
 
     status = in.readByte();
     fetchTime = in.readLong();
     retries = in.readByte();
     fetchInterval = in.readFloat();
-    linkCount = in.readInt();
+    score = in.readFloat();
   }
 
-  /** The number of bytes into a CrawlDatum that the linkCount is stored. */
-  private static final int LINK_COUNT_OFFSET = 1 + 1 + 8 + 1 + 4;
+  /** The number of bytes into a CrawlDatum that the score is stored. */
+  private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
 
   public void write(DataOutput out) throws IOException {
     out.writeByte(CUR_VERSION);                   // store current version
@@ -115,7 +118,7 @@
     out.writeLong(fetchTime);
     out.writeByte(retries);
     out.writeFloat(fetchInterval);
-    out.writeInt(linkCount);
+    out.writeFloat(score);
   }
 
   /** Copy the contents of another instance into this instance. */
@@ -124,7 +127,7 @@
     this.fetchTime = that.fetchTime;
     this.retries = that.retries;
     this.fetchInterval = that.fetchInterval;
-    this.linkCount = that.linkCount;
+    this.score = that.score;
   }
 
 
@@ -132,11 +135,11 @@
   // compare methods
   //
   
-  /** Sort by decreasing link count. */
+  /** Sort by decreasing score. */
   public int compareTo(Object o) {
     CrawlDatum that = (CrawlDatum)o; 
-    if (that.linkCount != this.linkCount)
-      return that.linkCount - this.linkCount;
+    if (that.score != this.score)
+      return (that.score - this.score) > 0 ? 1 : -1;
     if (that.status != this.status)
       return this.status - that.status;
     if (that.fetchTime != this.fetchTime)
@@ -153,10 +156,10 @@
     public Comparator() { super(CrawlDatum.class); }
 
     public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
-      int linkCount1 = readInt(b1,s1+LINK_COUNT_OFFSET);
-      int linkCount2 = readInt(b2,s2+LINK_COUNT_OFFSET);
-      if (linkCount2 != linkCount1) {
-        return linkCount2 - linkCount1;
+      float score1 = readFloat(b1,s1+SCORE_OFFSET);
+      float score2 = readFloat(b2,s2+SCORE_OFFSET);
+      if (score2 != score1) {
+        return (score2 - score1) > 0 ? 1 : -1;
       }
       int status1 = b1[s1+1];
       int status2 = b2[s2+1];
@@ -194,7 +197,7 @@
     buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
     buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
     buf.append("Retry interval: " + getFetchInterval() + " days\n");
-    buf.append("Link Count: " + getLinkCount() + "\n");
+    buf.append("Score: " + getScore() + "\n");
     return buf.toString();
   }
 
@@ -207,7 +210,7 @@
       (this.fetchTime == other.fetchTime) &&
       (this.retries == other.retries) &&
       (this.fetchInterval == other.fetchInterval) &&
-      (this.linkCount == other.linkCount);
+      (this.score == other.score);
   }
 
   public int hashCode() {
@@ -216,7 +219,7 @@
       ((int)fetchTime) ^
       retries ^
       Float.floatToIntBits(fetchInterval) ^
-      linkCount;
+      Float.floatToIntBits(score);
   }
 
   public Object clone() {

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu Oct 20 13:04:17 2005
@@ -38,11 +38,10 @@
 
     CrawlDatum highest = null;
     CrawlDatum old = null;
-    int linkCount = 0;
+    float scoreIncrement = 0.0f;
 
     while (values.hasNext()) {
       CrawlDatum datum = (CrawlDatum)values.next();
-      linkCount += datum.getLinkCount();          // sum link counts
 
       if (highest == null || datum.getStatus() > highest.getStatus()) {
         highest = datum;                          // find highest status
@@ -52,6 +51,10 @@
       case CrawlDatum.STATUS_DB_UNFETCHED:
       case CrawlDatum.STATUS_DB_FETCHED:
         old = datum;
+        break;
+      case CrawlDatum.STATUS_LINKED:
+        scoreIncrement += datum.getScore();
+        break;
       }
     }
 
@@ -71,6 +74,7 @@
       } else {
         result = highest;                         // use new entry
         result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+        result.setScore(1.0f);                    // initial score is 1.0f
       }
       break;
       
@@ -99,7 +103,7 @@
     }
     
     if (result != null) {
-      result.setLinkCount(linkCount);
+      result.setScore(result.getScore() + scoreIncrement);
       output.collect(key, result);
     }
   }

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Oct 20 13:04:17 2005
@@ -38,6 +38,7 @@
   
   public static final String DIGEST_KEY = "nutch.content.digest";
   public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+  public static final String SCORE_KEY = "nutch.crawl.score";
 
   public static class InputFormat extends SequenceFileInputFormat {
     /** Don't split inputs, to keep things polite. */
@@ -197,6 +198,8 @@
         (DIGEST_KEY, MD5Hash.digest(content.getContent()).toString());
       content.getMetadata().setProperty           // add segment to metadata
         (SEGMENT_NAME_KEY, segmentName);
+      content.getMetadata().setProperty           // add score to metadata
+        (SCORE_KEY, Float.toString(datum.getScore()));
 
       Parse parse = null;
       if (parsing) {

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Thu Oct 20 13:04:17 2005
@@ -61,7 +61,7 @@
       if (crawlDatum.getFetchTime() > curTime)
         return;                                   // not time yet
 
-      output.collect(crawlDatum, key);          // invert for sort by linkCount
+      output.collect(crawlDatum, key);          // invert for sort by score
     }
 
     /** Partition by host (value). */

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Thu Oct 20 13:04:17 2005
@@ -138,11 +138,9 @@
     super(conf);
   }
 
-  private boolean boostByLinkCount;
   private float scorePower;
 
   public void configure(JobConf job) {
-    boostByLinkCount = job.getBoolean("indexer.boost.by.link.count", false);
     scorePower = job.getFloat("indexer.score.power", 0.5f);
   }
 
@@ -150,7 +148,8 @@
                      OutputCollector output, Reporter reporter)
     throws IOException {
     Inlinks inlinks = null;
-    CrawlDatum crawlDatum = null;
+    CrawlDatum dbDatum = null;
+    CrawlDatum fetchDatum = null;
     ParseData parseData = null;
     ParseText parseText = null;
     while (values.hasNext()) {
@@ -158,7 +157,21 @@
       if (value instanceof Inlinks) {
         inlinks = (Inlinks)value;
       } else if (value instanceof CrawlDatum) {
-        crawlDatum = (CrawlDatum)value;
+        CrawlDatum datum = (CrawlDatum)value;
+        switch (datum.getStatus()) {
+        case CrawlDatum.STATUS_DB_UNFETCHED:
+        case CrawlDatum.STATUS_DB_FETCHED:
+        case CrawlDatum.STATUS_DB_GONE:
+          dbDatum = datum;
+          break;
+        case CrawlDatum.STATUS_FETCH_SUCCESS:
+        case CrawlDatum.STATUS_FETCH_RETRY:
+        case CrawlDatum.STATUS_FETCH_GONE:
+          fetchDatum = datum;
+          break;
+        default:
+          throw new RuntimeException("Unexpected status: "+datum.getStatus());
+        }
       } else if (value instanceof ParseData) {
         parseData = (ParseData)value;
       } else if (value instanceof ParseText) {
@@ -168,7 +181,8 @@
       }
     }      
 
-    if (crawlDatum == null || parseText == null || parseData == null) {
+    if (fetchDatum == null || dbDatum == null
+        || parseText == null || parseData == null) {
       return;                                     // only have inlinks
     }
 
@@ -183,10 +197,8 @@
     // add digest, used by dedup
     doc.add(Field.UnIndexed("digest", meta.getProperty(Fetcher.DIGEST_KEY)));
 
-    // compute boost
-    float boost =
-      IndexSegment.calculateBoost(1.0f, scorePower, boostByLinkCount,
-                                  anchors.length);
+    // boost is opic
+    float boost = (float)Math.pow(dbDatum.getScore(), scorePower);
     // apply boost to all indexed fields.
     doc.setBoost(boost);
     // store boost for use by explain and dedup
@@ -205,7 +217,7 @@
       FetcherOutput fo =
         new FetcherOutput(new FetchListEntry(true,new Page((UTF8)key),anchors),
                           null, null);
-      fo.setFetchDate(crawlDatum.getFetchTime());
+      fo.setFetchDate(fetchDatum.getFetchTime());
 
       // run indexing filters
       doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData),fo);
@@ -217,7 +229,7 @@
     output.collect(key, new ObjectWritable(doc));
   }
 
-  public void index(File indexDir, File linkDb, File[] segments)
+  public void index(File indexDir, File crawlDb, File linkDb, File[] segments)
     throws IOException {
 
     LOG.info("Indexer: starting");
@@ -232,6 +244,7 @@
       job.addInputDir(new File(segments[i], ParseText.DIR_NAME));
     }
 
+    job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME));
     job.addInputDir(new File(linkDb, LinkDb.CURRENT_NAME));
 
     job.setInputFormat(InputFormat.class);
@@ -253,17 +266,18 @@
   public static void main(String[] args) throws Exception {
     Indexer indexer = new Indexer(NutchConf.get());
     
-    if (args.length < 2) {
-      System.err.println("Usage: <index> <linkdb> <segment> <segment> ...");
+    if (args.length < 4) {
+      System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ...");
       return;
     }
     
-    File[] segments = new File[args.length-2];
-    for (int i = 2; i < args.length; i++) {
-      segments[i-2] = new File(args[i]);
+    File[] segments = new File[args.length-3];
+    for (int i = 3; i < args.length; i++) {
+      segments[i-3] = new File(args[i]);
     }
 
-    indexer.index(new File(args[0]), new File(args[1]), segments);
+    indexer.index(new File(args[0]), new File(args[1]), new File(args[2]),
+                  segments);
   }
 
 }

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java?rev=326994&r1=326993&r2=326994&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java Thu Oct 20 13:04:17 2005
@@ -64,6 +64,11 @@
 
           // collect outlinks for subsequent db update
           Outlink[] links = parse.getData().getOutlinks();
+
+          // compute OPIC score contribution
+          float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
+          score /= links.length;
+                          
           for (int i = 0; i < links.length; i++) {
             String toUrl = links[i].getToUrl();
             try {
@@ -75,7 +80,7 @@
             if (toUrl != null)
               crawlOut.append(new UTF8(toUrl),
                               new CrawlDatum(CrawlDatum.STATUS_LINKED,
-                                             interval));
+                                             interval, score));
           }
         }