You are viewing a plain text version of this content. The canonical link to the original was not preserved in this plain-text export.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/07/02 10:16:28 UTC

[1/4] nutch git commit: CrawlDb statistics: avoid overflow in sum of fetch times for large CrawlDb

Repository: nutch
Updated Branches:
  refs/heads/master 6b141fb10 -> ecf2bb011


CrawlDb statistics: avoid overflow in sum of fetch times for large CrawlDb


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/4800ad91
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/4800ad91
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/4800ad91

Branch: refs/heads/master
Commit: 4800ad91ab911aed8b139b39527bc437d82f0de3
Parents: 39f6c71
Author: Sebastian Nagel <sn...@apache.org>
Authored: Thu Jun 23 16:32:48 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Sat Jul 2 12:06:04 2016 +0200

----------------------------------------------------------------------
 src/java/org/apache/nutch/crawl/CrawlDbReader.java | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/4800ad91/src/java/org/apache/nutch/crawl/CrawlDbReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 3cf6ff3..5db5f95 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -198,8 +198,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
       output.collect(new Text("sc"), new LongWritable(
           (long) (value.getScore() * 1000.0)));
-      output.collect(new Text("ft"), new LongWritable(value.getFetchTime()));
-      output.collect(new Text("fi"), new LongWritable(value.getFetchInterval()));
+      // fetch time (in minutes to prevent from overflows when summing up)
+      output.collect(new Text("ft"),
+          new LongWritable(value.getFetchTime() / (1000 * 60)));
+      // fetch interval (in seconds)
+      output.collect(new Text("fi"),
+          new LongWritable(value.getFetchInterval()));
       if (sort) {
         URL u = new URL(key.toString());
         String host = u.getHost();
@@ -449,12 +453,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           LOG.info("avg score:\t"
               + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
         } else if (k.equals("ftn")) {
-          LOG.info("earliest fetch time:\t" + new Date(val.get()));
+          LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * val.get()));
         } else if (k.equals("ftx")) {
-          LOG.info("latest fetch time:\t" + new Date(val.get()));
+          LOG.info("latest fetch time:\t" + new Date(1000 * 60 * val.get()));
         } else if (k.equals("ftt")) {
           LOG.info("avg of fetch times:\t"
-              + new Date(val.get() / totalCnt.get()));
+              + new Date(1000 * 60 * (val.get() / totalCnt.get())));
         } else if (k.equals("fin")) {
           LOG.info("shortest fetch interval:\t{}",
               TimingUtil.secondsToDaysHMS(val.get()));


[3/4] nutch git commit: CrawlDb statistics: add fetch time (earliest, latest, average)

Posted by sn...@apache.org.
CrawlDb statistics: add fetch time (earliest, latest, average)


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/ea2843b9
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/ea2843b9
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/ea2843b9

Branch: refs/heads/master
Commit: ea2843b9be6569e17963031d7370f5db42261809
Parents: 6b141fb
Author: Sebastian Nagel <sn...@apache.org>
Authored: Mon Jun 20 14:42:04 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Sat Jul 2 12:06:04 2016 +0200

----------------------------------------------------------------------
 .../org/apache/nutch/crawl/CrawlDbReader.java   | 76 ++++++++++++--------
 1 file changed, 46 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/ea2843b9/src/java/org/apache/nutch/crawl/CrawlDbReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 8f42ac4..381cec5 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -197,6 +197,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
       output.collect(new Text("s"), new LongWritable(
           (long) (value.getScore() * 1000.0)));
+      output.collect(new Text("f"), new LongWritable(value.getFetchTime()));
       if (sort) {
         URL u = new URL(key.toString());
         String host = u.getHost();
@@ -219,32 +220,40 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
     public void close() {
     }
 
+    private void reduceMinMaxTotal(String keyPrefix, Iterator<LongWritable> values,
+        OutputCollector<Text, LongWritable> output, Reporter reporter)
+        throws IOException {
+      long total = 0;
+      long min = Long.MAX_VALUE;
+      long max = Long.MIN_VALUE;
+      while (values.hasNext()) {
+        LongWritable cnt = values.next();
+        if (cnt.get() < min)
+          min = cnt.get();
+        if (cnt.get() > max)
+          max = cnt.get();
+        total += cnt.get();
+      }
+      output.collect(new Text(keyPrefix+"n"), new LongWritable(min));
+      output.collect(new Text(keyPrefix+"x"), new LongWritable(max));
+      output.collect(new Text(keyPrefix+"t"), new LongWritable(total));
+    }
+    
     public void reduce(Text key, Iterator<LongWritable> values,
         OutputCollector<Text, LongWritable> output, Reporter reporter)
         throws IOException {
       val.set(0L);
       String k = key.toString();
-      if (!k.equals("s")) {
+      if (k.equals("s")) {
+        reduceMinMaxTotal("sc", values, output, reporter);
+      } else if (k.equals("f")) {
+        reduceMinMaxTotal("ft", values, output, reporter);
+      } else {
         while (values.hasNext()) {
           LongWritable cnt = values.next();
           val.set(val.get() + cnt.get());
         }
         output.collect(key, val);
-      } else {
-        long total = 0;
-        long min = Long.MAX_VALUE;
-        long max = Long.MIN_VALUE;
-        while (values.hasNext()) {
-          LongWritable cnt = values.next();
-          if (cnt.get() < min)
-            min = cnt.get();
-          if (cnt.get() > max)
-            max = cnt.get();
-          total += cnt.get();
-        }
-        output.collect(new Text("scn"), new LongWritable(min));
-        output.collect(new Text("scx"), new LongWritable(max));
-        output.collect(new Text("sct"), new LongWritable(total));
       }
     }
   }
@@ -277,7 +286,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           cnt.set(cnt.get() + val.get());
         }
         output.collect(key, cnt);
-      } else if (k.equals("scx")) {
+      } else if (k.equals("scx") || k.equals("ftx")) {
         LongWritable cnt = new LongWritable(Long.MIN_VALUE);
         while (values.hasNext()) {
           LongWritable val = values.next();
@@ -285,7 +294,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
             cnt.set(val.get());
         }
         output.collect(key, cnt);
-      } else if (k.equals("scn")) {
+      } else if (k.equals("scn") || k.equals("ftn")) {
         LongWritable cnt = new LongWritable(Long.MAX_VALUE);
         while (values.hasNext()) {
           LongWritable val = values.next();
@@ -293,7 +302,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
             cnt.set(val.get());
         }
         output.collect(key, cnt);
-      } else if (k.equals("sct")) {
+      } else if (k.equals("sct") || k.equals("ftt")) {
         LongWritable cnt = new LongWritable();
         while (values.hasNext()) {
           LongWritable val = values.next();
@@ -393,16 +402,16 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
 			  LongWritable val = stats.get(k);
 			  if (val == null) {
 				  val = new LongWritable();
-				  if (k.equals("scx"))
+				  if (k.equals("scx") || k.equals("ftx"))
 					  val.set(Long.MIN_VALUE);
-				  if (k.equals("scn"))
+				  if (k.equals("scn") || k.equals("ftn"))
 					  val.set(Long.MAX_VALUE);
 				  stats.put(k, val);
 			  }
-			  if (k.equals("scx")) {
+			  if (k.equals("scx") || k.equals("ftx")) {
 				  if (val.get() < value.get())
 					  val.set(value.get());
-			  } else if (k.equals("scn")) {
+			  } else if (k.equals("scn") || k.equals("ftn")) {
 				  if (val.get() > value.get())
 					  val.set(value.get());
 			  } else {
@@ -439,6 +448,13 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
         } else if (k.equals("sct")) {
           LOG.info("avg score:\t"
               + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
+        } else if (k.equals("ftn")) {
+          LOG.info("earliest fetch time:\t" + new Date(val.get()));
+        } else if (k.equals("ftx")) {
+          LOG.info("latest fetch time:\t" + new Date(val.get()));
+        } else if (k.equals("ftt")) {
+          LOG.info("avg of fetch times:\t"
+              + new Date(val.get() / totalCnt.get()));
         } else if (k.startsWith("status")) {
           String[] st = k.split(" ");
           int code = Integer.parseInt(st[1]);
@@ -732,11 +748,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
     return 0;
   }
   
-    public static void main(String[] args) throws Exception {
-        int result = ToolRunner.run(NutchConfiguration.create(),
-                new CrawlDbReader(), args);
-        System.exit(result);
-        }
+  public static void main(String[] args) throws Exception {
+    int result = ToolRunner.run(NutchConfiguration.create(),
+        new CrawlDbReader(), args);
+    System.exit(result);
+  }
+
   public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception {
  
 
@@ -759,7 +776,6 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
         String k = entry.getKey();
         LongWritable val = entry.getValue();
         if (k.equals("scn")) {
-
           results.put("minScore", String.valueOf((val.get() / 1000.0f)));
         } else if (k.equals("scx")) {
           results.put("maxScore", String.valueOf((val.get() / 1000.0f)));
@@ -854,5 +870,5 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
       return results;
     }
     return results;
-    }
+  }
 }
\ No newline at end of file


[4/4] nutch git commit: NUTCH-2286 - Merge branch 'CrawlDbStats' from https://github.com/sebastian-nagel/nutch this closes #125

Posted by sn...@apache.org.
NUTCH-2286 - Merge branch 'CrawlDbStats' from https://github.com/sebastian-nagel/nutch this closes #125


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/ecf2bb01
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/ecf2bb01
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/ecf2bb01

Branch: refs/heads/master
Commit: ecf2bb011df77c71d2789c3912d82ea7fcf56802
Parents: 6b141fb 4800ad9
Author: Sebastian Nagel <sn...@apache.org>
Authored: Sat Jul 2 12:06:47 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Sat Jul 2 12:06:47 2016 +0200

----------------------------------------------------------------------
 .../org/apache/nutch/crawl/CrawlDbReader.java   | 91 +++++++++++++-------
 src/java/org/apache/nutch/util/TimingUtil.java  | 53 +++++++-----
 2 files changed, 93 insertions(+), 51 deletions(-)
----------------------------------------------------------------------



[2/4] nutch git commit: CrawlDb statistics: add fetch interval (shortest, longest, average)

Posted by sn...@apache.org.
CrawlDb statistics: add fetch interval (shortest, longest, average)


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/39f6c713
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/39f6c713
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/39f6c713

Branch: refs/heads/master
Commit: 39f6c713974240d19d54a515cd04372878739456
Parents: ea2843b
Author: Sebastian Nagel <sn...@apache.org>
Authored: Wed Jun 22 16:22:33 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Sat Jul 2 12:06:04 2016 +0200

----------------------------------------------------------------------
 .../org/apache/nutch/crawl/CrawlDbReader.java   | 35 ++++++++-----
 src/java/org/apache/nutch/util/TimingUtil.java  | 53 ++++++++++++--------
 2 files changed, 55 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/39f6c713/src/java/org/apache/nutch/crawl/CrawlDbReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 381cec5..3cf6ff3 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -69,6 +69,7 @@ import org.apache.nutch.util.JexlUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TimingUtil;
 import org.apache.commons.jexl2.Expression;
 import org.apache.commons.jexl2.JexlEngine;
 import org.apache.commons.lang.time.DateUtils;
@@ -195,9 +196,10 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
       output.collect(new Text("status " + value.getStatus()), COUNT_1);
       output
           .collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
-      output.collect(new Text("s"), new LongWritable(
+      output.collect(new Text("sc"), new LongWritable(
           (long) (value.getScore() * 1000.0)));
-      output.collect(new Text("f"), new LongWritable(value.getFetchTime()));
+      output.collect(new Text("ft"), new LongWritable(value.getFetchTime()));
+      output.collect(new Text("fi"), new LongWritable(value.getFetchInterval()));
       if (sort) {
         URL u = new URL(key.toString());
         String host = u.getHost();
@@ -244,10 +246,8 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
         throws IOException {
       val.set(0L);
       String k = key.toString();
-      if (k.equals("s")) {
-        reduceMinMaxTotal("sc", values, output, reporter);
-      } else if (k.equals("f")) {
-        reduceMinMaxTotal("ft", values, output, reporter);
+      if (k.equals("sc") || k.equals("ft") || k.equals("fi")) {
+        reduceMinMaxTotal(k, values, output, reporter);
       } else {
         while (values.hasNext()) {
           LongWritable cnt = values.next();
@@ -286,7 +286,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
           cnt.set(cnt.get() + val.get());
         }
         output.collect(key, cnt);
-      } else if (k.equals("scx") || k.equals("ftx")) {
+      } else if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
         LongWritable cnt = new LongWritable(Long.MIN_VALUE);
         while (values.hasNext()) {
           LongWritable val = values.next();
@@ -294,7 +294,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
             cnt.set(val.get());
         }
         output.collect(key, cnt);
-      } else if (k.equals("scn") || k.equals("ftn")) {
+      } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
         LongWritable cnt = new LongWritable(Long.MAX_VALUE);
         while (values.hasNext()) {
           LongWritable val = values.next();
@@ -302,7 +302,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
             cnt.set(val.get());
         }
         output.collect(key, cnt);
-      } else if (k.equals("sct") || k.equals("ftt")) {
+      } else if (k.equals("sct") || k.equals("ftt") || k.equals("fit")) {
         LongWritable cnt = new LongWritable();
         while (values.hasNext()) {
           LongWritable val = values.next();
@@ -402,16 +402,16 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
 			  LongWritable val = stats.get(k);
 			  if (val == null) {
 				  val = new LongWritable();
-				  if (k.equals("scx") || k.equals("ftx"))
+				  if (k.equals("scx") || k.equals("ftx") || k.equals("fix"))
 					  val.set(Long.MIN_VALUE);
-				  if (k.equals("scn") || k.equals("ftn"))
+				  if (k.equals("scn") || k.equals("ftn") || k.equals("fin"))
 					  val.set(Long.MAX_VALUE);
 				  stats.put(k, val);
 			  }
-			  if (k.equals("scx") || k.equals("ftx")) {
+			  if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
 				  if (val.get() < value.get())
 					  val.set(value.get());
-			  } else if (k.equals("scn") || k.equals("ftn")) {
+			  } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
 				  if (val.get() > value.get())
 					  val.set(value.get());
 			  } else {
@@ -455,6 +455,15 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
         } else if (k.equals("ftt")) {
           LOG.info("avg of fetch times:\t"
               + new Date(val.get() / totalCnt.get()));
+        } else if (k.equals("fin")) {
+          LOG.info("shortest fetch interval:\t{}",
+              TimingUtil.secondsToDaysHMS(val.get()));
+        } else if (k.equals("fix")) {
+          LOG.info("longest fetch interval:\t{}",
+              TimingUtil.secondsToDaysHMS(val.get()));
+        } else if (k.equals("fit")) {
+          LOG.info("avg fetch interval:\t{}",
+              TimingUtil.secondsToDaysHMS(val.get() / totalCnt.get()));
         } else if (k.startsWith("status")) {
           String[] st = k.split(" ");
           int code = Integer.parseInt(st[1]);

http://git-wip-us.apache.org/repos/asf/nutch/blob/39f6c713/src/java/org/apache/nutch/util/TimingUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java
index 8f77969..c4af356 100644
--- a/src/java/org/apache/nutch/util/TimingUtil.java
+++ b/src/java/org/apache/nutch/util/TimingUtil.java
@@ -17,12 +17,10 @@
 
 package org.apache.nutch.util;
 
-import java.text.NumberFormat;
+import java.util.concurrent.TimeUnit;
 
 public class TimingUtil {
 
-  private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 };
-
   /**
    * Calculate the elapsed time between two times specified in milliseconds.
    * 
@@ -37,23 +35,38 @@ public class TimingUtil {
     if (start > end) {
       return null;
     }
+    return secondsToHMS((end-start)/1000);
+  }
+  
+  /**
+   * Show time in seconds as hours, minutes and seconds (hh:mm:ss)
+   * 
+   * @param seconds
+   *          (elapsed) time in seconds
+   * @return human readable time string "hh:mm:ss"
+   */
+  public static String secondsToHMS(long seconds) {
+    long hours = TimeUnit.SECONDS.toHours(seconds);
+    long minutes = TimeUnit.SECONDS.toMinutes(seconds)
+        % TimeUnit.HOURS.toMinutes(1);
+    seconds = TimeUnit.SECONDS.toSeconds(seconds)
+        % TimeUnit.MINUTES.toSeconds(1);
+    return String.format("%02d:%02d:%02d", hours, minutes, seconds);
+  }
 
-    long[] elapsedTime = new long[TIME_FACTOR.length];
-
-    for (int i = 0; i < TIME_FACTOR.length; i++) {
-      elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i];
-      start += TIME_FACTOR[i] * elapsedTime[i];
-    }
-
-    NumberFormat nf = NumberFormat.getInstance();
-    nf.setMinimumIntegerDigits(2);
-    StringBuffer buf = new StringBuffer();
-    for (int i = 0; i < elapsedTime.length; i++) {
-      if (i > 0) {
-        buf.append(":");
-      }
-      buf.append(nf.format(elapsedTime[i]));
-    }
-    return buf.toString();
+  /**
+   * Show time in seconds as days, hours, minutes and seconds (d days, hh:mm:ss)
+   * 
+   * @param seconds
+   *          (elapsed) time in seconds
+   * @return human readable time string "d days, hh:mm:ss"
+   */
+  public static String secondsToDaysHMS(long seconds) {
+    long days = TimeUnit.SECONDS.toDays(seconds);
+    if (days == 0)
+      return secondsToHMS(seconds);
+    String hhmmss = secondsToHMS(seconds % TimeUnit.DAYS.toSeconds(1));
+    return String.format("%d days, %s", days, hhmmss);
   }
+
 }