You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:49:47 UTC

[nutch] 03/23: - filter out NaN scores which break the quantile calculation

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 26669eb1f3f75e466eae732e79a4e6e85ea57073
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Mon Dec 11 10:35:46 2017 +0100

    - filter out NaN scores which break the quantile calculation
---
 src/java/org/apache/nutch/crawl/CrawlDbReader.java | 27 ++++++++++++++--------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 117aa7f..af30664 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -203,11 +203,15 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
       output.collect(new Text("retry " + value.getRetriesSinceFetch()),
           COUNT_1);
 
-      NutchWritable score = new NutchWritable(
-          new FloatWritable(value.getScore()));
-      output.collect(new Text("sc"), score);
-      output.collect(new Text("sct"), score);
-      output.collect(new Text("scd"), score);
+      if (Float.isNaN(value.getScore())) {
+        output.collect(new Text("scNaN"), COUNT_1);
+      } else {
+        NutchWritable score = new NutchWritable(
+            new FloatWritable(value.getScore()));
+        output.collect(new Text("sc"), score);
+        output.collect(new Text("sct"), score);
+        output.collect(new Text("scd"), score);
+      }
 
       // fetch time (in minutes to prevent from overflows when summing up)
       NutchWritable fetchTime = new NutchWritable(
@@ -287,7 +291,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           cnt += value;
         }
         output.collect(key, new NutchWritable(new FloatWritable(cnt)));
-      } else if (k.equals("scd") || k.equals("ftd") || k.equals("fid")) {
+      } else if (k.equals("scd")) {
         MergingDigest tdigest = null;
         while (values.hasNext()) {
           Writable value = values.next().get();
@@ -301,10 +305,13 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
               tdigest.add(tdig);
             }
           } else if (value instanceof FloatWritable) {
-            if (tdigest == null) {
-              tdigest = (MergingDigest) TDigest.createMergingDigest(100.0);
+            float val = ((FloatWritable) value).get();
+            if (!Float.isNaN(val)) {
+              if (tdigest == null) {
+                tdigest = (MergingDigest) TDigest.createMergingDigest(100.0);
+              }
+              tdigest.add(val);
             }
-            tdigest.add(((FloatWritable) value).get());
           }
         }
         ByteBuffer tdigestBytes = ByteBuffer.allocate(tdigest.smallByteSize());
@@ -521,6 +528,8 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           LOG.info("max score:\t" + fvalue);
         } else if (k.equals("sct")) {
           LOG.info("avg score:\t" + (fvalue / totalCnt.get()));
+        } else if (k.equals("scNaN")) {
+          LOG.info("score == NaN:\t" + value);
         } else if (k.equals("ftn")) {
           LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * value));
         } else if (k.equals("ftx")) {

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.