You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2017/03/16 10:40:24 UTC
[nutch] branch master updated: NUTCH-2367 Get single record from HostDB
This is an automated email from the ASF dual-hosted git repository.
markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new be3aea1 NUTCH-2367 Get single record from HostDB
be3aea1 is described below
commit be3aea1410835b34cfacdff7c3def9fb01a83e76
Author: Markus Jelsma <ma...@apache.org>
AuthorDate: Thu Mar 16 11:40:02 2017 +0100
NUTCH-2367 Get single record from HostDB
---
src/java/org/apache/nutch/hostdb/ReadHostDb.java | 39 ++++++++++++++++++++++--
1 file changed, 37 insertions(+), 2 deletions(-)
diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
index 5b08504..17e135a 100644
--- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -30,9 +30,11 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
@@ -200,6 +202,29 @@ public class ReadHostDb extends Configured implements Tool {
long end = System.currentTimeMillis();
LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
+
+ private void getHostDbRecord(Path hostDb, String host) throws Exception {
+ Configuration conf = getConf();
+ SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, hostDb);
+
+ Class<?> keyClass = readers[0].getKeyClass();
+ Class<?> valueClass = readers[0].getValueClass();
+
+ if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
+ throw new IOException("Incompatible key (" + keyClass.getName() + ")");
+
+ Text key = (Text) keyClass.newInstance();
+ HostDatum value = (HostDatum) valueClass.newInstance();
+
+ for (int i = 0; i < readers.length; i++) {
+ while (readers[i].next(key, value)) {
+ if (host.equals(key.toString())) {
+ System.out.println(value.toString());
+ }
+ }
+ readers[i].close();
+ }
+ }
public static void main(String args[]) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new ReadHostDb(), args);
@@ -208,13 +233,14 @@ public class ReadHostDb extends Configured implements Tool {
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err.println("Usage: ReadHostDb <hostdb> <output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]");
+ System.err.println("Usage: ReadHostDb <hostdb> [-get <url>] [<output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]]");
return -1;
}
boolean dumpHomepages = false;
boolean dumpHostnames = false;
String expr = null;
+ String get = null;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-dumpHomepages")) {
@@ -225,6 +251,11 @@ public class ReadHostDb extends Configured implements Tool {
LOG.info("ReadHostDb: dumping hostnames");
dumpHostnames = true;
}
+ if (args[i].equals("-get")) {
+ get = args[i + 1];
+ LOG.info("ReadHostDb: get: "+ get);
+ i++;
+ }
if (args[i].equals("-expr")) {
expr = args[i + 1];
LOG.info("ReadHostDb: evaluating expression: " + expr);
@@ -233,7 +264,11 @@ public class ReadHostDb extends Configured implements Tool {
}
try {
- readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
+ if (get != null) {
+ getHostDbRecord(new Path(args[0], "current"), get);
+ } else {
+ readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
+ }
return 0;
} catch (Exception e) {
LOG.error("ReadHostDb: " + StringUtils.stringifyException(e));
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.