You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:49:47 UTC
[nutch] 03/23: - filter out NaN scores which break the quantile calculation
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 26669eb1f3f75e466eae732e79a4e6e85ea57073
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Mon Dec 11 10:35:46 2017 +0100
- filter out NaN scores which break the quantile calculation
---
src/java/org/apache/nutch/crawl/CrawlDbReader.java | 27 ++++++++++++++--------
1 file changed, 18 insertions(+), 9 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 117aa7f..af30664 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -203,11 +203,15 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
output.collect(new Text("retry " + value.getRetriesSinceFetch()),
COUNT_1);
- NutchWritable score = new NutchWritable(
- new FloatWritable(value.getScore()));
- output.collect(new Text("sc"), score);
- output.collect(new Text("sct"), score);
- output.collect(new Text("scd"), score);
+ if (Float.isNaN(value.getScore())) {
+ output.collect(new Text("scNaN"), COUNT_1);
+ } else {
+ NutchWritable score = new NutchWritable(
+ new FloatWritable(value.getScore()));
+ output.collect(new Text("sc"), score);
+ output.collect(new Text("sct"), score);
+ output.collect(new Text("scd"), score);
+ }
// fetch time (in minutes to prevent from overflows when summing up)
NutchWritable fetchTime = new NutchWritable(
@@ -287,7 +291,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
cnt += value;
}
output.collect(key, new NutchWritable(new FloatWritable(cnt)));
- } else if (k.equals("scd") || k.equals("ftd") || k.equals("fid")) {
+ } else if (k.equals("scd")) {
MergingDigest tdigest = null;
while (values.hasNext()) {
Writable value = values.next().get();
@@ -301,10 +305,13 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
tdigest.add(tdig);
}
} else if (value instanceof FloatWritable) {
- if (tdigest == null) {
- tdigest = (MergingDigest) TDigest.createMergingDigest(100.0);
+ float val = ((FloatWritable) value).get();
+ if (!Float.isNaN(val)) {
+ if (tdigest == null) {
+ tdigest = (MergingDigest) TDigest.createMergingDigest(100.0);
+ }
+ tdigest.add(val);
}
- tdigest.add(((FloatWritable) value).get());
}
}
ByteBuffer tdigestBytes = ByteBuffer.allocate(tdigest.smallByteSize());
@@ -521,6 +528,8 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
LOG.info("max score:\t" + fvalue);
} else if (k.equals("sct")) {
LOG.info("avg score:\t" + (fvalue / totalCnt.get()));
+ } else if (k.equals("scNaN")) {
+ LOG.info("score == NaN:\t" + value);
} else if (k.equals("ftn")) {
LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * value));
} else if (k.equals("ftx")) {
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.