You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2017/03/16 10:40:24 UTC

[nutch] branch master updated: NUTCH-2367 Get single record from HostDB

This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

The following commit(s) were added to refs/heads/master by this push:
       new  be3aea1   NUTCH-2367 Get single record from HostDB
be3aea1 is described below

commit be3aea1410835b34cfacdff7c3def9fb01a83e76
Author: Markus Jelsma <ma...@apache.org>
AuthorDate: Thu Mar 16 11:40:02 2017 +0100

    NUTCH-2367 Get single record from HostDB
---
 src/java/org/apache/nutch/hostdb/ReadHostDb.java | 39 ++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
index 5b08504..17e135a 100644
--- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -30,9 +30,11 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.FloatWritable;
 import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
@@ -200,6 +202,29 @@ public class ReadHostDb extends Configured implements Tool {
     long end = System.currentTimeMillis();
     LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
   }
+  
+  /** Prints every HostDatum whose key equals {@code host}, scanning all readers opened on {@code hostDb}.
+   *  @throws IOException if no readers are returned or the key class is not Text */
+  private void getHostDbRecord(Path hostDb, String host) throws Exception {
+    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), hostDb);
+    try {
+      if (readers.length == 0)
+        throw new IOException("No HostDB data found in " + hostDb);
+      Class<?> keyClass = readers[0].getKeyClass();
+      if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
+        throw new IOException("Incompatible key (" + keyClass.getName() + ")");
+      Text key = (Text) keyClass.newInstance();
+      HostDatum value = (HostDatum) readers[0].getValueClass().newInstance();
+      for (SequenceFile.Reader reader : readers) {
+        while (reader.next(key, value)) {
+          if (host.equals(key.toString())) System.out.println(value.toString());
+        }
+      }
+    } finally {
+      // Close in finally so readers are released even when a check or next() throws.
+      for (SequenceFile.Reader reader : readers) reader.close();
+    }
+  }
 
   public static void main(String args[]) throws Exception {
     int res = ToolRunner.run(NutchConfiguration.create(), new ReadHostDb(), args);
@@ -208,13 +233,14 @@ public class ReadHostDb extends Configured implements Tool {
 
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err.println("Usage: ReadHostDb <hostdb> <output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]");
+      System.err.println("Usage: ReadHostDb <hostdb> [-get <url>] [<output> [-dumpHomepages | -dumpHostnames | -expr <expr.>]]");
       return -1;
     }
 
     boolean dumpHomepages = false;
     boolean dumpHostnames = false;
     String expr = null;
+    String get = null;
 
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-dumpHomepages")) {
@@ -225,6 +251,11 @@ public class ReadHostDb extends Configured implements Tool {
         LOG.info("ReadHostDb: dumping hostnames");
         dumpHostnames = true;
       }
+      if (args[i].equals("-get")) {
+        get = args[i + 1];
+        LOG.info("ReadHostDb: get: "+ get);
+        i++;
+      }
       if (args[i].equals("-expr")) {
         expr = args[i + 1];
         LOG.info("ReadHostDb: evaluating expression: " + expr);
@@ -233,7 +264,11 @@ public class ReadHostDb extends Configured implements Tool {
     }
 
     try {
-      readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
+      if (get != null) {
+        getHostDbRecord(new Path(args[0], "current"), get);
+      } else {
+        readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr);
+      }
       return 0;
     } catch (Exception e) {
       LOG.error("ReadHostDb: " + StringUtils.stringifyException(e));

-- 
To stop receiving notification emails like this one, please contact
['"commits@nutch.apache.org" <co...@nutch.apache.org>'].