You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/03/31 19:34:38 UTC

svn commit: r1670404 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReader.java src/java/org/apache/nutch/tools/Benchmark.java

Author: markus
Date: Tue Mar 31 17:34:37 2015
New Revision: 1670404

URL: http://svn.apache.org/r1670404
Log:
NUTCH-1979 CrawlDbReader to implement Tool


Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
    nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1670404&r1=1670403&r2=1670404&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Mar 31 17:34:37 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1979 CrawlDbReader to implement Tool (markus)
+
 * NUTCH-1970 Pretty print JSON output in config resource (Tyler Palsulich, mattmann)
 
 * NUTCH-1976 Allow Users to Set Hostname for Server (Tyler Palsulich via mattmann)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1670404&r1=1670403&r2=1670404&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Tue Mar 31 17:34:37 2015
@@ -33,7 +33,7 @@ import java.util.TreeMap;
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.FloatWritable;
@@ -59,6 +59,8 @@ import org.apache.hadoop.mapred.lib.Hash
 import org.apache.hadoop.mapred.lib.IdentityMapper;
 import org.apache.hadoop.mapred.lib.IdentityReducer;
 import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.StringUtil;
@@ -69,13 +71,13 @@ import org.apache.nutch.util.StringUtil;
  * @author Andrzej Bialecki
  * 
  */
-public class CrawlDbReader implements Closeable {
+public class CrawlDbReader extends Configured implements Closeable, Tool {
 
   public static final Logger LOG = LoggerFactory.getLogger(CrawlDbReader.class);
 
   private MapFile.Reader[] readers = null;
 
-  private void openReaders(String crawlDb, Configuration config)
+  private void openReaders(String crawlDb, JobConf config)
       throws IOException {
     if (readers != null)
       return;
@@ -343,7 +345,7 @@ public class CrawlDbReader implements Cl
     closeReaders();
   }
 
-  public void processStatJob(String crawlDb, Configuration config, boolean sort)
+  public void processStatJob(String crawlDb, JobConf config, boolean sort)
       throws IOException {
 
     if (LOG.isInfoEnabled()) {
@@ -443,7 +445,7 @@ public class CrawlDbReader implements Cl
 
   }
 
-  public CrawlDatum get(String crawlDb, String url, Configuration config)
+  public CrawlDatum get(String crawlDb, String url, JobConf config)
       throws IOException {
     Text key = new Text(url);
     CrawlDatum val = new CrawlDatum();
@@ -453,7 +455,7 @@ public class CrawlDbReader implements Cl
     return res;
   }
 
-  public void readUrl(String crawlDb, String url, Configuration config)
+  public void readUrl(String crawlDb, String url, JobConf config)
       throws IOException {
     CrawlDatum res = get(crawlDb, url, config);
     System.out.println("URL: " + url);
@@ -465,7 +467,7 @@ public class CrawlDbReader implements Cl
   }
 
   public void processDumpJob(String crawlDb, String output,
-      Configuration config, String format, String regex, String status,
+      JobConf config, String format, String regex, String status,
       Integer retry) throws IOException {
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb dump: starting");
@@ -554,7 +556,7 @@ public class CrawlDbReader implements Cl
   }
 
   public void processTopNJob(String crawlDb, long topN, float min,
-      String output, Configuration config) throws IOException {
+      String output, JobConf config) throws IOException {
 
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
@@ -609,7 +611,7 @@ public class CrawlDbReader implements Cl
 
   }
 
-  public static void main(String[] args) throws IOException {
+  public int run(String[] args) throws IOException {
     @SuppressWarnings("resource")
     CrawlDbReader dbr = new CrawlDbReader();
 
@@ -638,11 +640,11 @@ public class CrawlDbReader implements Cl
       System.err
           .println("\t\t[<min>]\tskip records with scores below this value.");
       System.err.println("\t\t\tThis can significantly improve performance.");
-      return;
+      return -1;
     }
     String param = null;
     String crawlDb = args[0];
-    Configuration conf = NutchConfiguration.create();
+    JobConf job = new NutchJob(getConf());
     for (int i = 1; i < args.length; i++) {
       if (args[i].equals("-stats")) {
         boolean toSort = false;
@@ -650,7 +652,7 @@ public class CrawlDbReader implements Cl
           toSort = true;
           i++;
         }
-        dbr.processStatJob(crawlDb, conf, toSort);
+        dbr.processStatJob(crawlDb, job, toSort);
       } else if (args[i].equals("-dump")) {
         param = args[++i];
         String format = "normal";
@@ -675,10 +677,10 @@ public class CrawlDbReader implements Cl
             i = i + 2;
           }
         }
-        dbr.processDumpJob(crawlDb, param, conf, format, regex, status, retry);
+        dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry);
       } else if (args[i].equals("-url")) {
         param = args[++i];
-        dbr.readUrl(crawlDb, param, conf);
+        dbr.readUrl(crawlDb, param, job);
       } else if (args[i].equals("-topN")) {
         param = args[++i];
         long topN = Long.parseLong(param);
@@ -687,11 +689,18 @@ public class CrawlDbReader implements Cl
         if (i < args.length - 1) {
           min = Float.parseFloat(args[++i]);
         }
-        dbr.processTopNJob(crawlDb, topN, min, param, conf);
+        dbr.processTopNJob(crawlDb, topN, min, param, job);
       } else {
         System.err.println("\nError: wrong argument " + args[i]);
+        return -1;
       }
     }
-    return;
+    return 0;
   }
-}
+  
+    public static void main(String[] args) throws Exception {
+        int result = ToolRunner.run(NutchConfiguration.create(),
+                new CrawlDbReader(), args);
+        System.exit(result);
+    }
+}
\ No newline at end of file

Modified: nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java?rev=1670404&r1=1670403&r2=1670404&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java Tue Mar 31 17:34:37 2015
@@ -277,7 +277,7 @@ public class Benchmark extends Configure
     }
     res.elapsed = System.currentTimeMillis() - res.elapsed;
     CrawlDbReader dbreader = new CrawlDbReader();
-    dbreader.processStatJob(crawlDb.toString(), conf, false);
+    dbreader.processStatJob(crawlDb.toString(), job, false);
     return res;
   }