You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/03/18 20:21:13 UTC
svn commit: r386875 -
/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Author: ab
Date: Sat Mar 18 11:21:11 2006
New Revision: 386875
URL: http://svn.apache.org/viewcvs?rev=386875&view=rev
Log:
Apply patch in NUTCH-230, which provides additional control over which
outlinks are considered for OPIC "cash" value distribution.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=386875&r1=386874&r2=386875&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Sat Mar 18 11:21:11 2006
@@ -25,6 +25,7 @@
import org.apache.nutch.net.*;
import java.io.*;
+import java.util.ArrayList;
/* Parse content in a segment. */
public class ParseOutputFormat implements OutputFormat {
@@ -42,6 +43,7 @@
this.filters = new URLFilters(job);
final float interval = job.getFloat("db.default.fetch.interval", 30f);
final float extscore = job.getFloat("db.score.link.external", 1.0f);
+ final boolean countFiltered = job.getBoolean("db.score.count.filtered", false);
File text =
new File(new File(job.getOutputDir(), ParseText.DIR_NAME), name);
@@ -92,9 +94,9 @@
.getContentMeta().get(Fetcher.SCORE_KEY);
float score = extscore;
// this may happen if there was a fetch error.
- if (scoreString != null) score = Float.parseFloat(scoreString);
- score /= links.length;
-
+ if (scoreString != null) score = Float.parseFloat(scoreString);
+ String[] toUrls = new String[links.length];
+ int validCount = 0;
for (int i = 0; i < links.length; i++) {
String toUrl = links[i].getToUrl();
try {
@@ -103,10 +105,18 @@
} catch (Exception e) {
toUrl = null;
}
- if (toUrl != null)
- crawlOut.append(new UTF8(toUrl),
- new CrawlDatum(CrawlDatum.STATUS_LINKED,
- interval, score));
+ if (toUrl != null) validCount++;
+ toUrls[i] = toUrl;
+ }
+ if (countFiltered) {
+ score = score / links.length;
+ } else {
+ score = score / validCount;
+ }
+ for (int i = 0; i < toUrls.length; i++) {
+ if (toUrls[i] == null) continue;
+ crawlOut.append(new UTF8(toUrls[i]),
+ new CrawlDatum(CrawlDatum.STATUS_LINKED, interval, score));
}
}