You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/07/02 10:16:29 UTC
[2/4] nutch git commit: CrawlDb statistics: add fetch interval
(shortest, longest, average)
CrawlDb statistics: add fetch interval (shortest, longest, average)
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/39f6c713
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/39f6c713
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/39f6c713
Branch: refs/heads/master
Commit: 39f6c713974240d19d54a515cd04372878739456
Parents: ea2843b
Author: Sebastian Nagel <sn...@apache.org>
Authored: Wed Jun 22 16:22:33 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Sat Jul 2 12:06:04 2016 +0200
----------------------------------------------------------------------
.../org/apache/nutch/crawl/CrawlDbReader.java | 35 ++++++++-----
src/java/org/apache/nutch/util/TimingUtil.java | 53 ++++++++++++--------
2 files changed, 55 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/39f6c713/src/java/org/apache/nutch/crawl/CrawlDbReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 381cec5..3cf6ff3 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -69,6 +69,7 @@ import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TimingUtil;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.lang.time.DateUtils;
@@ -195,9 +196,10 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
output.collect(new Text("status " + value.getStatus()), COUNT_1);
output
.collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
- output.collect(new Text("s"), new LongWritable(
+ output.collect(new Text("sc"), new LongWritable(
(long) (value.getScore() * 1000.0)));
- output.collect(new Text("f"), new LongWritable(value.getFetchTime()));
+ output.collect(new Text("ft"), new LongWritable(value.getFetchTime()));
+ output.collect(new Text("fi"), new LongWritable(value.getFetchInterval()));
if (sort) {
URL u = new URL(key.toString());
String host = u.getHost();
@@ -244,10 +246,8 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
throws IOException {
val.set(0L);
String k = key.toString();
- if (k.equals("s")) {
- reduceMinMaxTotal("sc", values, output, reporter);
- } else if (k.equals("f")) {
- reduceMinMaxTotal("ft", values, output, reporter);
+ if (k.equals("sc") || k.equals("ft") || k.equals("fi")) {
+ reduceMinMaxTotal(k, values, output, reporter);
} else {
while (values.hasNext()) {
LongWritable cnt = values.next();
@@ -286,7 +286,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
cnt.set(cnt.get() + val.get());
}
output.collect(key, cnt);
- } else if (k.equals("scx") || k.equals("ftx")) {
+ } else if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
LongWritable cnt = new LongWritable(Long.MIN_VALUE);
while (values.hasNext()) {
LongWritable val = values.next();
@@ -294,7 +294,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
cnt.set(val.get());
}
output.collect(key, cnt);
- } else if (k.equals("scn") || k.equals("ftn")) {
+ } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
LongWritable cnt = new LongWritable(Long.MAX_VALUE);
while (values.hasNext()) {
LongWritable val = values.next();
@@ -302,7 +302,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
cnt.set(val.get());
}
output.collect(key, cnt);
- } else if (k.equals("sct") || k.equals("ftt")) {
+ } else if (k.equals("sct") || k.equals("ftt") || k.equals("fit")) {
LongWritable cnt = new LongWritable();
while (values.hasNext()) {
LongWritable val = values.next();
@@ -402,16 +402,16 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
LongWritable val = stats.get(k);
if (val == null) {
val = new LongWritable();
- if (k.equals("scx") || k.equals("ftx"))
+ if (k.equals("scx") || k.equals("ftx") || k.equals("fix"))
val.set(Long.MIN_VALUE);
- if (k.equals("scn") || k.equals("ftn"))
+ if (k.equals("scn") || k.equals("ftn") || k.equals("fin"))
val.set(Long.MAX_VALUE);
stats.put(k, val);
}
- if (k.equals("scx") || k.equals("ftx")) {
+ if (k.equals("scx") || k.equals("ftx") || k.equals("fix")) {
if (val.get() < value.get())
val.set(value.get());
- } else if (k.equals("scn") || k.equals("ftn")) {
+ } else if (k.equals("scn") || k.equals("ftn") || k.equals("fin")) {
if (val.get() > value.get())
val.set(value.get());
} else {
@@ -455,6 +455,15 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
} else if (k.equals("ftt")) {
LOG.info("avg of fetch times:\t"
+ new Date(val.get() / totalCnt.get()));
+ } else if (k.equals("fin")) {
+ LOG.info("shortest fetch interval:\t{}",
+ TimingUtil.secondsToDaysHMS(val.get()));
+ } else if (k.equals("fix")) {
+ LOG.info("longest fetch interval:\t{}",
+ TimingUtil.secondsToDaysHMS(val.get()));
+ } else if (k.equals("fit")) {
+ LOG.info("avg fetch interval:\t{}",
+ TimingUtil.secondsToDaysHMS(val.get() / totalCnt.get()));
} else if (k.startsWith("status")) {
String[] st = k.split(" ");
int code = Integer.parseInt(st[1]);
http://git-wip-us.apache.org/repos/asf/nutch/blob/39f6c713/src/java/org/apache/nutch/util/TimingUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java
index 8f77969..c4af356 100644
--- a/src/java/org/apache/nutch/util/TimingUtil.java
+++ b/src/java/org/apache/nutch/util/TimingUtil.java
@@ -17,12 +17,10 @@
package org.apache.nutch.util;
-import java.text.NumberFormat;
+import java.util.concurrent.TimeUnit;
public class TimingUtil {
- private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 };
-
/**
* Calculate the elapsed time between two times specified in milliseconds.
*
@@ -37,23 +35,38 @@ public class TimingUtil {
if (start > end) {
return null;
}
+ return secondsToHMS((end-start)/1000);
+ }
+
+ /**
+ * Format a time span in seconds as hh:mm:ss (hours are not wrapped at 24)
+ *
+ * @param seconds
+ * (elapsed) time in seconds
+ * @return time string "hh:mm:ss", fields zero-padded to at least two digits
+ */
+ public static String secondsToHMS(long seconds) {
+ long hours = TimeUnit.SECONDS.toHours(seconds);
+ long minutes = TimeUnit.SECONDS.toMinutes(seconds)
+ % TimeUnit.HOURS.toMinutes(1);
+ seconds = TimeUnit.SECONDS.toSeconds(seconds)
+ % TimeUnit.MINUTES.toSeconds(1);
+ return String.format("%02d:%02d:%02d", hours, minutes, seconds);
+ }
- long[] elapsedTime = new long[TIME_FACTOR.length];
-
- for (int i = 0; i < TIME_FACTOR.length; i++) {
- elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i];
- start += TIME_FACTOR[i] * elapsedTime[i];
- }
-
- NumberFormat nf = NumberFormat.getInstance();
- nf.setMinimumIntegerDigits(2);
- StringBuffer buf = new StringBuffer();
- for (int i = 0; i < elapsedTime.length; i++) {
- if (i > 0) {
- buf.append(":");
- }
- buf.append(nf.format(elapsedTime[i]));
- }
- return buf.toString();
+ /**
+ * Format a time span in seconds as whole days plus the remainder as hh:mm:ss
+ *
+ * @param seconds
+ * (elapsed) time in seconds
+ * @return "d days, hh:mm:ss", or only "hh:mm:ss" if the whole-day count is 0
+ */
+ public static String secondsToDaysHMS(long seconds) {
+ long days = TimeUnit.SECONDS.toDays(seconds);
+ if (days == 0)
+ return secondsToHMS(seconds);
+ String hhmmss = secondsToHMS(seconds % TimeUnit.DAYS.toSeconds(1));
+ return String.format("%d days, %s", days, hhmmss);
}
+
}