You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/07/02 10:16:30 UTC
[3/4] nutch git commit: CrawlDb statistics: add fetch time (earliest,
latest, average)
CrawlDb statistics: add fetch time (earliest, latest, average)
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/ea2843b9
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/ea2843b9
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/ea2843b9
Branch: refs/heads/master
Commit: ea2843b9be6569e17963031d7370f5db42261809
Parents: 6b141fb
Author: Sebastian Nagel <sn...@apache.org>
Authored: Mon Jun 20 14:42:04 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Sat Jul 2 12:06:04 2016 +0200
----------------------------------------------------------------------
.../org/apache/nutch/crawl/CrawlDbReader.java | 76 ++++++++++++--------
1 file changed, 46 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/ea2843b9/src/java/org/apache/nutch/crawl/CrawlDbReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 8f42ac4..381cec5 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -197,6 +197,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
.collect(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1);
output.collect(new Text("s"), new LongWritable(
(long) (value.getScore() * 1000.0)));
+ output.collect(new Text("f"), new LongWritable(value.getFetchTime()));
if (sort) {
URL u = new URL(key.toString());
String host = u.getHost();
@@ -219,32 +220,40 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
public void close() {
}
+ private void reduceMinMaxTotal(String keyPrefix, Iterator<LongWritable> values,
+ OutputCollector<Text, LongWritable> output, Reporter reporter)
+ throws IOException {
+ long total = 0;
+ long min = Long.MAX_VALUE;
+ long max = Long.MIN_VALUE;
+ while (values.hasNext()) {
+ LongWritable cnt = values.next();
+ if (cnt.get() < min)
+ min = cnt.get();
+ if (cnt.get() > max)
+ max = cnt.get();
+ total += cnt.get();
+ }
+ output.collect(new Text(keyPrefix+"n"), new LongWritable(min));
+ output.collect(new Text(keyPrefix+"x"), new LongWritable(max));
+ output.collect(new Text(keyPrefix+"t"), new LongWritable(total));
+ }
+
public void reduce(Text key, Iterator<LongWritable> values,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
val.set(0L);
String k = key.toString();
- if (!k.equals("s")) {
+ if (k.equals("s")) {
+ reduceMinMaxTotal("sc", values, output, reporter);
+ } else if (k.equals("f")) {
+ reduceMinMaxTotal("ft", values, output, reporter);
+ } else {
while (values.hasNext()) {
LongWritable cnt = values.next();
val.set(val.get() + cnt.get());
}
output.collect(key, val);
- } else {
- long total = 0;
- long min = Long.MAX_VALUE;
- long max = Long.MIN_VALUE;
- while (values.hasNext()) {
- LongWritable cnt = values.next();
- if (cnt.get() < min)
- min = cnt.get();
- if (cnt.get() > max)
- max = cnt.get();
- total += cnt.get();
- }
- output.collect(new Text("scn"), new LongWritable(min));
- output.collect(new Text("scx"), new LongWritable(max));
- output.collect(new Text("sct"), new LongWritable(total));
}
}
}
@@ -277,7 +286,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
cnt.set(cnt.get() + val.get());
}
output.collect(key, cnt);
- } else if (k.equals("scx")) {
+ } else if (k.equals("scx") || k.equals("ftx")) {
LongWritable cnt = new LongWritable(Long.MIN_VALUE);
while (values.hasNext()) {
LongWritable val = values.next();
@@ -285,7 +294,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
cnt.set(val.get());
}
output.collect(key, cnt);
- } else if (k.equals("scn")) {
+ } else if (k.equals("scn") || k.equals("ftn")) {
LongWritable cnt = new LongWritable(Long.MAX_VALUE);
while (values.hasNext()) {
LongWritable val = values.next();
@@ -293,7 +302,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
cnt.set(val.get());
}
output.collect(key, cnt);
- } else if (k.equals("sct")) {
+ } else if (k.equals("sct") || k.equals("ftt")) {
LongWritable cnt = new LongWritable();
while (values.hasNext()) {
LongWritable val = values.next();
@@ -393,16 +402,16 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
LongWritable val = stats.get(k);
if (val == null) {
val = new LongWritable();
- if (k.equals("scx"))
+ if (k.equals("scx") || k.equals("ftx"))
val.set(Long.MIN_VALUE);
- if (k.equals("scn"))
+ if (k.equals("scn") || k.equals("ftn"))
val.set(Long.MAX_VALUE);
stats.put(k, val);
}
- if (k.equals("scx")) {
+ if (k.equals("scx") || k.equals("ftx")) {
if (val.get() < value.get())
val.set(value.get());
- } else if (k.equals("scn")) {
+ } else if (k.equals("scn") || k.equals("ftn")) {
if (val.get() > value.get())
val.set(value.get());
} else {
@@ -439,6 +448,13 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
} else if (k.equals("sct")) {
LOG.info("avg score:\t"
+ (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
+ } else if (k.equals("ftn")) {
+ LOG.info("earliest fetch time:\t" + new Date(val.get()));
+ } else if (k.equals("ftx")) {
+ LOG.info("latest fetch time:\t" + new Date(val.get()));
+ } else if (k.equals("ftt")) {
+ LOG.info("avg of fetch times:\t"
+ + new Date(val.get() / totalCnt.get()));
} else if (k.startsWith("status")) {
String[] st = k.split(" ");
int code = Integer.parseInt(st[1]);
@@ -732,11 +748,12 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
return 0;
}
- public static void main(String[] args) throws Exception {
- int result = ToolRunner.run(NutchConfiguration.create(),
- new CrawlDbReader(), args);
- System.exit(result);
- }
+ public static void main(String[] args) throws Exception {
+ int result = ToolRunner.run(NutchConfiguration.create(),
+ new CrawlDbReader(), args);
+ System.exit(result);
+ }
+
public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception {
@@ -759,7 +776,6 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
String k = entry.getKey();
LongWritable val = entry.getValue();
if (k.equals("scn")) {
-
results.put("minScore", String.valueOf((val.get() / 1000.0f)));
} else if (k.equals("scx")) {
results.put("maxScore", String.valueOf((val.get() / 1000.0f)));
@@ -854,5 +870,5 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
return results;
}
return results;
- }
+ }
}
\ No newline at end of file