You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/09/23 19:30:46 UTC

svn commit: r449279 - in /lucene/nutch/branches/branch-0.8: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/scoring/ src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/

Author: ab
Date: Sat Sep 23 10:30:45 2006
New Revision: 449279

URL: http://svn.apache.org/viewvc?view=rev&rev=449279
Log:
NUTCH-336: differentiate between newly discovered pages (known value through
inlink contributions) and newly injected pages (arbitrarily defined initial
value).

Modified:
    lucene/nutch/branches/branch-0.8/CHANGES.txt
    lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
    lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java
    lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java
    lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java
    lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java

Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.8/CHANGES.txt Sat Sep 23 10:30:45 2006
@@ -31,6 +31,10 @@
 
 10. NUTCH-332 - Fix doubling score caused by links to self (Stefan
     Groschupf via ab)
+
+11. NUTCH-336 - Differentiate between newly discovered pages and newly
+    injected pages (Chris Schneider via ab) NOTE: this changes the
+    scoring API, filter implementations need to be updated.
     
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Sat Sep 23 10:30:45 2006
@@ -36,12 +36,10 @@
   private CrawlDatum result = new CrawlDatum();
   private ArrayList linked = new ArrayList();
   private ScoringFilters scfilters = null;
-  private float scoreInjected;
 
   public void configure(JobConf job) {
     retryMax = job.getInt("db.fetch.retry.max", 3);
     scfilters = new ScoringFilters(job);
-    scoreInjected = job.getFloat("db.score.injected", 1.0f);
   }
 
   public void close() {}
@@ -112,7 +110,7 @@
             LOG.warn("Cannot filter init score for url " + key +
                      ", using default: " + e.getMessage());
           }
-          result.setScore(scoreInjected);
+          result.setScore(0.0f);
         }
       }
       break;

Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java Sat Sep 23 10:30:45 2006
@@ -78,10 +78,10 @@
         CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, interval);
         datum.setScore(scoreInjected);
         try {
-          scfilters.initialScore(value, datum);
+          scfilters.injectedScore(value, datum);
         } catch (ScoringFilterException e) {
           if (LOG.isWarnEnabled()) {
-            LOG.warn("Cannot filter init score for url " + url +
+            LOG.warn("Cannot filter injected score for url " + url +
                      ", using default (" + e.getMessage() + ")");
           }
           datum.setScore(scoreInjected);

Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java Sat Sep 23 10:30:45 2006
@@ -41,7 +41,21 @@
   public final static String X_POINT_ID = ScoringFilter.class.getName();
   
   /**
-   * Set an initial score for newly injected pages.
+   * Set an initial score for newly injected pages. Note: newly injected pages
+   * may have no inlinks, so filter implementations may wish to set this 
+   * score to a non-zero value, to give newly injected pages some initial
+   * credit.
+   * @param url url of the page
+   * @param datum new datum. Filters will modify it in-place.
+   * @throws ScoringFilterException
+   */
+  public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;
+  
+  /**
+   * Set an initial score for newly discovered pages. Note: newly discovered pages
+   * have at least one inlink with its score contribution, so filter implementations
+   * may choose to set initial score to zero (unknown value), and then the inlink
+   * score contribution will set the "real" value of the new page.
    * @param url url of the page
    * @param datum new datum. Filters will modify it in-place.
    * @throws ScoringFilterException

Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java Sat Sep 23 10:30:45 2006
@@ -92,10 +92,17 @@
     return initSort;
   }
 
-  /** Calculate a new initial score, used when adding new pages. */
+  /** Calculate a new initial score, used when adding newly discovered pages. */
   public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
     for (int i = 0; i < this.filters.length; i++) {
       this.filters[i].initialScore(url, datum);
+    }
+  }
+
+  /** Calculate a new initial score, used when injecting new pages. */
+  public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      this.filters[i].injectedScore(url, datum);
     }
   }
 

Modified: lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Sat Sep 23 10:30:45 2006
@@ -73,8 +73,14 @@
   }
 
   /** Set to the value defined in config, 1.0f by default. */
-  public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+  public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
     datum.setScore(scoreInjected);
+  }
+
+  /** Set to 0.0f (unknown value) - inlink contributions will bring it to
+   * a correct level. Newly discovered pages have at least one inlink. */
+  public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+    datum.setScore(0.0f);
   }
 
   /** Use {@link CrawlDatum#getScore()}. */