You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/09/23 19:30:46 UTC
svn commit: r449279 - in /lucene/nutch/branches/branch-0.8: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/scoring/
src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/
Author: ab
Date: Sat Sep 23 10:30:45 2006
New Revision: 449279
URL: http://svn.apache.org/viewvc?view=rev&rev=449279
Log:
NUTCH-336: differentiate between newly discovered pages (known value through
inlink contributions) and newly injected pages (arbitrarily defined initial
value).
Modified:
lucene/nutch/branches/branch-0.8/CHANGES.txt
lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java
lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java
lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.8/CHANGES.txt Sat Sep 23 10:30:45 2006
@@ -31,6 +31,10 @@
10. NUTCH-332 - Fix doubling score caused by links to self (Stefan
Groschupf via ab)
+
+11. NUTCH-336 - Differentiate between newly discovered pages and newly
+ injected pages (Chris Schneider via ab) NOTE: this changes the
+ scoring API, filter implementations need to be updated.
Release 0.8 - 2006-07-25
Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Sat Sep 23 10:30:45 2006
@@ -36,12 +36,10 @@
private CrawlDatum result = new CrawlDatum();
private ArrayList linked = new ArrayList();
private ScoringFilters scfilters = null;
- private float scoreInjected;
public void configure(JobConf job) {
retryMax = job.getInt("db.fetch.retry.max", 3);
scfilters = new ScoringFilters(job);
- scoreInjected = job.getFloat("db.score.injected", 1.0f);
}
public void close() {}
@@ -112,7 +110,7 @@
LOG.warn("Cannot filter init score for url " + key +
", using default: " + e.getMessage());
}
- result.setScore(scoreInjected);
+ result.setScore(0.0f);
}
}
break;
Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java Sat Sep 23 10:30:45 2006
@@ -78,10 +78,10 @@
CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, interval);
datum.setScore(scoreInjected);
try {
- scfilters.initialScore(value, datum);
+ scfilters.injectedScore(value, datum);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
- LOG.warn("Cannot filter init score for url " + url +
+ LOG.warn("Cannot filter injected score for url " + url +
", using default (" + e.getMessage() + ")");
}
datum.setScore(scoreInjected);
Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java Sat Sep 23 10:30:45 2006
@@ -41,7 +41,21 @@
public final static String X_POINT_ID = ScoringFilter.class.getName();
/**
- * Set an initial score for newly injected pages.
+ * Set an initial score for newly injected pages. Note: newly injected pages
+ * may have no inlinks, so filter implementations may wish to set this
+ * score to a non-zero value, to give newly injected pages some initial
+ * credit.
+ * @param url url of the page
+ * @param datum new datum. Filters will modify it in-place.
+ * @throws ScoringFilterException
+ */
+ public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;
+
+ /**
+ * Set an initial score for newly discovered pages. Note: newly discovered pages
+ * have at least one inlink with its score contribution, so filter implementations
+ * may choose to set initial score to zero (unknown value), and then the inlink
+ * score contribution will set the "real" value of the new page.
* @param url url of the page
* @param datum new datum. Filters will modify it in-place.
* @throws ScoringFilterException
Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java Sat Sep 23 10:30:45 2006
@@ -92,10 +92,17 @@
return initSort;
}
- /** Calculate a new initial score, used when adding new pages. */
+ /** Calculate a new initial score, used when adding newly discovered pages. */
public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].initialScore(url, datum);
+ }
+ }
+
+ /** Calculate a new initial score, used when injecting new pages. */
+ public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+ for (int i = 0; i < this.filters.length; i++) {
+ this.filters[i].injectedScore(url, datum);
}
}
Modified: lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=449279&r1=449278&r2=449279
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Sat Sep 23 10:30:45 2006
@@ -73,8 +73,14 @@
}
/** Set to the value defined in config, 1.0f by default. */
- public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+ public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
datum.setScore(scoreInjected);
+ }
+
+ /** Set to 0.0f (unknown value) - inlink contributions will bring it to
+ * a correct level. Newly discovered pages have at least one inlink. */
+ public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+ datum.setScore(0.0f);
}
/** Use {@link CrawlDatum#getScore()}. */