Posted to commits@nutch.apache.org by ma...@apache.org on 2015/04/22 03:46:28 UTC

svn commit: r1675243 [1/2] - in /nutch/trunk: ./ conf/ ivy/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/ src/java/org/apache...

Author: mattmann
Date: Wed Apr 22 01:46:28 2015
New Revision: 1675243

URL: http://svn.apache.org/r1675243
Log:
Fix for NUTCH-1973 Job Administration endpoint for the REST service, contributed by Sujen Shah <su...@gmail.com>; this closes #16.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/log4j.properties
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
    nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
    nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
    nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
    nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
    nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
    nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java
    nutch/trunk/src/java/org/apache/nutch/service/JobManager.java
    nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java
    nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
    nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java
    nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java
    nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java
    nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
    nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
    nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Apr 22 01:46:28 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1973 Job Administration end point for the REST service (Sujen Shah via mattmann)
+
 * NUTCH-1697 SegmentMerger to implement Tool (markus, snagel)
 
 * NUTCH-1987 - Make bin/crawl indexer agnostic (Michael Joyce, snagel via mattmann)
@@ -9,6 +11,20 @@ Nutch Current Development 1.10-SNAPSHOT
 * NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel)
 
 * NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
+
+* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel)
+
+* NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via mattmann)
+
+* NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via mattmann)
+
+* NUTCH-1911 Improve DomainStatistics tool command line parsing (Michael Joyce via mattmann)
 
 * NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)
 

Modified: nutch/trunk/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Wed Apr 22 01:46:28 2015
@@ -54,7 +54,6 @@ log4j.logger.org.apache.nutch.indexer.In
 log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout
 log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout
 log4j.logger.org.apache.nutch.tools.CrawlDBScanner=INFO,cmdstdout
-log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout
 log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN
 
 log4j.logger.org.apache.nutch=INFO

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Apr 22 01:46:28 2015
@@ -118,15 +118,6 @@
 </property>
 
 <property>
-  <name>http.robot.rules.whitelist</name>
-  <value></value>
-  <description>Comma separated list of hostnames or IP addresses to ignore 
-  robot rules parsing for. Use with care and only if you are explicitly
-  allowed by the site owner to ignore the site's robots.txt!
-  </description>
-</property>
-
-<property>
   <name>http.robots.403.allow</name>
   <value>true</value>
   <description>Some servers return HTTP status 403 (Forbidden) if
@@ -1590,32 +1581,22 @@
 
 <property>
   <name>elastic.host</name>
-  <value>localhost</value>
-  <description>
-  The hostname to send documents to using TransportClient. Either host
-  and port must be defined or cluster to connect.
-  </description>
+  <value></value>
+  <description>The hostname to send documents to using TransportClient. Either
+  host and port, or cluster, must be defined.</description>
 </property>
 
 <property> 
   <name>elastic.port</name>
-  <value>9300</value>
-  <description>
-  The port to connect to using TransportClient. Note, the default port
-  for HTTP is 9200. The TransportClient port is NOT the same. By default
-  it should be 9300.
+  <value>9300</value>
+  <description>The port to connect to using TransportClient.
   </description>
 </property>
 
 <property> 
   <name>elastic.cluster</name>
-  <value>elasticsearch</value>
-  <description>
-  The cluster name to discover. Either host and port must be defined
-  or cluster. The default Elasticsearch cluster name is 'elasticsearch'. If
-  yours is different be sure to update this value even if you specified a host
-  and port for connection otherwise you may encounter issues.
-  </description>
+  <value></value>
+  <description>The cluster name to discover. Either host and port, or cluster,
+  must be defined.</description>
 </property>
 
 <property> 

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Apr 22 01:46:28 2015
@@ -43,8 +43,6 @@
 		
 		<dependency org="commons-lang" name="commons-lang" rev="2.6"
 			conf="*->default" />
-		<dependency org="commons-validator" name="commons-validator" rev="1.4.1"
-			conf="*->default" />
 		<dependency org="commons-collections" name="commons-collections"
 			rev="3.1" conf="*->default" />
 		<dependency org="commons-httpclient" name="commons-httpclient"

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Apr 22 01:46:28 2015
@@ -24,24 +24,23 @@ import java.util.*;
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
-
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.LockUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
 
 /**
  * This class takes the output of the fetcher and updates the crawldb
  * accordingly.
  */
-public class CrawlDb extends Configured implements Tool {
+public class CrawlDb extends NutchTool implements Tool {
   public static final Logger LOG = LoggerFactory.getLogger(CrawlDb.class);
 
   public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
@@ -232,4 +231,61 @@ public class CrawlDb extends Configured
       return -1;
     }
   }
+
+  /*
+   * Used for Nutch REST service
+   */
+  @Override
+  public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+    String RESULT = "result";
+    boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING,
+        false);
+    boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
+    boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED,
+        true);
+    boolean force = false;
+    HashSet<Path> dirs = new HashSet<Path>();
+
+    if (args.containsKey("normalize")) {
+      normalize = true;
+    } 
+    if (args.containsKey("filter")) {
+      filter = true;
+    } 
+    if (args.containsKey("force")) {
+      force = true;
+    } 
+    if (args.containsKey("noAdditions")) {
+      additionsAllowed = false;
+    }
+    
+    String crawldb = crawlId+"/crawldb";
+    String segment_dir = crawlId+"/segments";
+    File segmentsDir = new File(segment_dir);
+    File[] segmentsList = segmentsDir.listFiles();  
+    // sort segments newest-first (descending lastModified); Long.compare
+    // keeps this consistent with the Comparator contract
+    Arrays.sort(segmentsList, new Comparator<File>(){
+      @Override
+      public int compare(File f1, File f2) {
+        return Long.compare(f2.lastModified(), f1.lastModified());
+      }
+    });
+    
+    dirs.add(new Path(segmentsList[0].getPath()));
+    
+    try {
+      update(new Path(crawldb), dirs.toArray(new Path[dirs.size()]), normalize,
+          filter, additionsAllowed, force);
+      results.put(RESULT, Integer.toString(0));
+      return results;
+    } catch (Exception e) {
+      LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
+      results.put(RESULT, Integer.toString(-1));
+      return results;
+    }
+  }
 }
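
For illustration, a minimal sketch (not part of this commit) of driving the new
NutchTool entry point added above outside the REST layer. The wrapper class,
the "crawl" directory name, and the chosen flags are hypothetical; flag keys
mirror the containsKey() checks in run():

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.nutch.crawl.CrawlDb;
    import org.apache.nutch.util.NutchConfiguration;

    public class CrawlDbRunSketch {
      public static void main(String[] cli) throws Exception {
        Map<String, String> args = new HashMap<String, String>();
        args.put("normalize", "");    // flags are detected by key presence
        args.put("noAdditions", "");  // disable crawldb additions
        CrawlDb crawlDb = new CrawlDb();
        crawlDb.setConf(NutchConfiguration.create());
        // run() derives <crawlId>/crawldb and <crawlId>/segments internally
        Map<String, Object> result = crawlDb.run(args, "crawl");
        System.out.println(result.get("result")); // "0" on success, "-1" on failure
      }
    }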

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Apr 22 01:46:28 2015
@@ -18,10 +18,12 @@
 package org.apache.nutch.crawl;
 
 import java.io.DataOutputStream;
+import java.io.File;
 import java.io.IOException;
 import java.io.Closeable;
 import java.net.URL;
 import java.util.Date;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -30,10 +32,20 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.TreeMap;
 
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.FloatWritable;
@@ -63,6 +75,7 @@ import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.StringUtil;
 
 /**
@@ -345,70 +358,76 @@ public class CrawlDbReader extends Confi
     closeReaders();
   }
 
-  public void processStatJob(String crawlDb, JobConf config, boolean sort)
+  private TreeMap<String, LongWritable> processStatJobHelper(String crawlDb, Configuration config, boolean sort) throws IOException{
+	  Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
+
+	  JobConf job = new NutchJob(config);
+	  job.setJobName("stats " + crawlDb);
+	  job.setBoolean("db.reader.stats.sort", sort);
+
+	  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+	  job.setInputFormat(SequenceFileInputFormat.class);
+
+	  job.setMapperClass(CrawlDbStatMapper.class);
+	  job.setCombinerClass(CrawlDbStatCombiner.class);
+	  job.setReducerClass(CrawlDbStatReducer.class);
+
+	  FileOutputFormat.setOutputPath(job, tmpFolder);
+	  job.setOutputFormat(SequenceFileOutputFormat.class);
+	  job.setOutputKeyClass(Text.class);
+	  job.setOutputValueClass(LongWritable.class);
+
+	  // https://issues.apache.org/jira/browse/NUTCH-1029
+	  job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+	  JobClient.runJob(job);
+
+	  // reading the result
+	  FileSystem fileSystem = FileSystem.get(config);
+	  SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config,
+			  tmpFolder);
+
+	  Text key = new Text();
+	  LongWritable value = new LongWritable();
+
+	  TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
+	  for (int i = 0; i < readers.length; i++) {
+		  SequenceFile.Reader reader = readers[i];
+		  while (reader.next(key, value)) {
+			  String k = key.toString();
+			  LongWritable val = stats.get(k);
+			  if (val == null) {
+				  val = new LongWritable();
+				  if (k.equals("scx"))
+					  val.set(Long.MIN_VALUE);
+				  if (k.equals("scn"))
+					  val.set(Long.MAX_VALUE);
+				  stats.put(k, val);
+			  }
+			  if (k.equals("scx")) {
+				  if (val.get() < value.get())
+					  val.set(value.get());
+			  } else if (k.equals("scn")) {
+				  if (val.get() > value.get())
+					  val.set(value.get());
+			  } else {
+				  val.set(val.get() + value.get());
+			  }
+		  }
+		  reader.close();
+	  }
+	  // removing the tmp folder
+	  fileSystem.delete(tmpFolder, true);
+	  return stats;
+  }
+  
+  public void processStatJob(String crawlDb, Configuration config, boolean sort)
       throws IOException {
 
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb statistics start: " + crawlDb);
     }
-
-    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
-
-    JobConf job = new NutchJob(config);
-    job.setJobName("stats " + crawlDb);
-    job.setBoolean("db.reader.stats.sort", sort);
-
-    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
-    job.setInputFormat(SequenceFileInputFormat.class);
-
-    job.setMapperClass(CrawlDbStatMapper.class);
-    job.setCombinerClass(CrawlDbStatCombiner.class);
-    job.setReducerClass(CrawlDbStatReducer.class);
-
-    FileOutputFormat.setOutputPath(job, tmpFolder);
-    job.setOutputFormat(SequenceFileOutputFormat.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(LongWritable.class);
-
-    // https://issues.apache.org/jira/browse/NUTCH-1029
-    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
-
-    JobClient.runJob(job);
-
-    // reading the result
-    FileSystem fileSystem = FileSystem.get(config);
-    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config,
-        tmpFolder);
-
-    Text key = new Text();
-    LongWritable value = new LongWritable();
-
-    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
-    for (int i = 0; i < readers.length; i++) {
-      SequenceFile.Reader reader = readers[i];
-      while (reader.next(key, value)) {
-        String k = key.toString();
-        LongWritable val = stats.get(k);
-        if (val == null) {
-          val = new LongWritable();
-          if (k.equals("scx"))
-            val.set(Long.MIN_VALUE);
-          if (k.equals("scn"))
-            val.set(Long.MAX_VALUE);
-          stats.put(k, val);
-        }
-        if (k.equals("scx")) {
-          if (val.get() < value.get())
-            val.set(value.get());
-        } else if (k.equals("scn")) {
-          if (val.get() > value.get())
-            val.set(value.get());
-        } else {
-          val.set(val.get() + value.get());
-        }
-      }
-      reader.close();
-    }
+    TreeMap<String, LongWritable> stats = processStatJobHelper(crawlDb, config, sort);
 
     if (LOG.isInfoEnabled()) {
       LOG.info("Statistics for CrawlDb: " + crawlDb);
@@ -437,8 +456,6 @@ public class CrawlDbReader extends Confi
           LOG.info(k + ":\t" + val);
       }
     }
-    // removing the tmp folder
-    fileSystem.delete(tmpFolder, true);
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb statistics: done");
     }
@@ -622,7 +639,7 @@ public class CrawlDbReader extends Confi
           .println("\t<crawldb>\tdirectory name where crawldb is located");
       System.err
           .println("\t-stats [-sort] \tprint overall statistics to System.out");
-      System.err.println("\t\t\tand optionally sort by host");
+      System.err.println("\t\t[-sort]\tlist status sorted by host");
       System.err
           .println("\t-dump <out_dir> [-format normal|csv|crawldb]\tdump the whole db to a text file in <out_dir>");
       System.err.println("\t\t[-format csv]\tdump in Csv format");
@@ -702,5 +719,118 @@ public class CrawlDbReader extends Confi
         int result = ToolRunner.run(NutchConfiguration.create(),
                 new CrawlDbReader(), args);
         System.exit(result);
+        }
+  public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception {
+ 
+
+    Map<String, Object> results = new HashMap<String, Object>();
+    String crawlDb = crawlId + "/crawldb";
+
+    if(type.equalsIgnoreCase("stats")){
+      boolean sort = false;
+      if(args.containsKey("sort")){
+        if(args.get("sort").equalsIgnoreCase("true"))
+          sort = true;
+      }
+      TreeMap<String , LongWritable> stats = processStatJobHelper(crawlDb, NutchConfiguration.create(), sort);
+      LongWritable totalCnt = stats.get("T");
+      stats.remove("T");
+      results.put("totalUrls", String.valueOf(totalCnt.get()));
+      Map<String, Object> statusMap = new HashMap<String, Object>();      
+
+      for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
+        String k = entry.getKey();
+        LongWritable val = entry.getValue();
+        if (k.equals("scn")) {
+
+          results.put("minScore", String.valueOf((val.get() / 1000.0f)));
+        } else if (k.equals("scx")) {
+          results.put("maxScore", String.valueOf((val.get() / 1000.0f)));
+        } else if (k.equals("sct")) {
+          results.put("avgScore", String.valueOf((float) ((((double) val.get()) / totalCnt.get()) / 1000.0)));
+        } else if (k.startsWith("status")) {
+          String[] st = k.split(" ");
+          int code = Integer.parseInt(st[1]);
+          if (st.length > 2){
+            Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap.get(String.valueOf(code));
+            Map<String, String> hostValues;
+            if(individualStatusInfo.containsKey("hostValues")){
+              hostValues= (Map<String, String>) individualStatusInfo.get("hostValues");
+            }
+            else{
+              hostValues = new HashMap<String, String>();
+              individualStatusInfo.put("hostValues", hostValues);
+            }
+            hostValues.put(st[2], String.valueOf(val));
+          }
+          else{
+            Map<String, Object> individualStatusInfo = new HashMap<String, Object>();
+
+            individualStatusInfo.put("statusValue", CrawlDatum.getStatusName((byte) code));
+            individualStatusInfo.put("count", String.valueOf(val));
+
+            statusMap.put(String.valueOf(code), individualStatusInfo);
+          }
+        } else
+          results.put(k, String.valueOf(val));			  
+      }
+      results.put("status", statusMap);
+      return results;
+    }
+    if(type.equalsIgnoreCase("dump")){
+      String output = args.get("out_dir");
+      String format = "normal";
+      String regex = null;
+      Integer retry = null;
+      String status = null;
+      if (args.containsKey("format")) {
+        format = args.get("format");
+      }
+      if (args.containsKey("regex")) {
+        regex = args.get("regex");
+      }
+      if (args.containsKey("retry")) {
+        retry = Integer.parseInt(args.get("retry"));
+      }
+      if (args.containsKey("status")) {
+        status = args.get("status");
+      }
+      processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry);
+      File dumpFile = new File(output+"/part-00000");
+      return dumpFile;		  
+    }
+    if (type.equalsIgnoreCase("topN")) {
+      String output = args.get("out_dir");
+      long topN = Long.parseLong(args.get("nnn"));
+      float min = 0.0f;
+      if(args.containsKey("min")){
+        min = Float.parseFloat(args.get("min"));
+      }
+      processTopNJob(crawlDb, topN, min, output, new NutchJob(conf));
+      File dumpFile = new File(output+"/part-00000");
+      return dumpFile;
+    }
+
+    if(type.equalsIgnoreCase("url")){
+      String url = args.get("url");
+      CrawlDatum res = get(crawlDb, url, new NutchJob(conf));
+      results.put("status", res.getStatus());
+      results.put("fetchTime", new Date(res.getFetchTime()));
+      results.put("modifiedTime", new Date(res.getModifiedTime()));
+      results.put("retriesSinceFetch", res.getRetriesSinceFetch());
+      results.put("retryInterval", res.getFetchInterval());
+      results.put("score", res.getScore());
+      results.put("signature", StringUtil.toHexString(res.getSignature()));
+      Map<String, String> metadata = new HashMap<String, String>();
+      if(res.getMetaData()!=null){
+        for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
+          metadata.put(String.valueOf(e.getKey()), String.valueOf(e.getValue()));
+        }
+      }
+      results.put("metadata", metadata);
+
+      return results;
+    }
+    return results;
     }
-}
+}
\ No newline at end of file
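
As a usage note, a minimal sketch (not part of this commit) of the new query()
entry point for the "stats" type. The wrapper class, the "crawl" directory, and
the unchecked cast are assumptions; argument keys follow the parsing above:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.nutch.crawl.CrawlDbReader;
    import org.apache.nutch.util.NutchConfiguration;

    public class CrawlDbQuerySketch {
      public static void main(String[] cli) throws Exception {
        Map<String, String> args = new HashMap<String, String>();
        args.put("sort", "true");  // optionally break status counts down by host
        CrawlDbReader reader = new CrawlDbReader();
        @SuppressWarnings("unchecked")
        Map<String, Object> stats = (Map<String, Object>) reader.query(
            args, NutchConfiguration.create(), "stats", "crawl");
        System.out.println(stats.get("totalUrls"));
        System.out.println(stats.get("status")); // per-status info keyed by status code
      }
    }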

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Wed Apr 22 01:46:28 2015
@@ -18,7 +18,9 @@ package org.apache.nutch.crawl;
 
 import java.io.IOException;
 import java.text.SimpleDateFormat;
+import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Random;
 
 import org.apache.hadoop.conf.Configured;
@@ -45,6 +47,7 @@ import org.apache.nutch.crawl.CrawlDatum
 import org.apache.nutch.crawl.CrawlDb;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -58,7 +61,7 @@ import org.slf4j.LoggerFactory;
  * then the one with the shortest URL is kept. The documents marked as duplicate
  * can then be deleted with the command CleaningJob.
  ***/
-public class DeduplicationJob extends Configured implements Tool {
+public class DeduplicationJob extends NutchTool implements Tool {
 
   public static final Logger LOG = LoggerFactory
       .getLogger(DeduplicationJob.class);
@@ -294,4 +297,19 @@ public class DeduplicationJob extends Co
         new DeduplicationJob(), args);
     System.exit(result);
   }
+
+  @Override
+  public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+//    if(args.size()<1){
+//      throw new IllegalArgumentException("Required argument <crawldb>");
+//    }
+    Map<String, Object> results = new HashMap<String, Object>();
+    String RESULT = "result";
+    String[] arg = new String[1];
+    String crawldb = crawlId+"/crawldb";
+    arg[0] = crawldb;
+    int res = run(arg);
+    results.put(RESULT, Integer.toString(res));
+    return results;
+  }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Apr 22 01:46:28 2015
@@ -25,7 +25,6 @@ import java.text.*;
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
@@ -34,7 +33,6 @@ import org.apache.hadoop.util.*;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
@@ -44,6 +42,7 @@ import org.apache.nutch.scoring.ScoringF
 import org.apache.nutch.util.LockUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
 import org.apache.nutch.util.URLUtil;
 
@@ -55,7 +54,7 @@ import org.apache.nutch.util.URLUtil;
  * a segment. We can chose separately how to count the URLS i.e. by domain or
  * host to limit the entries.
  **/
-public class Generator extends Configured implements Tool {
+public class Generator extends NutchTool implements Tool {
 
   public static final Logger LOG = LoggerFactory.getLogger(Generator.class);
 
@@ -751,4 +750,63 @@ public class Generator extends Configure
     }
     return 0;
   }
+
+  @Override
+  public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+
+
+    Map<String, Object> results = new HashMap<String, Object>();
+    String RESULT = "result";
+    String crawldb = crawlId+"/crawldb";
+    Path dbDir = new Path(crawldb);
+    String segments_dir = crawlId+"/segments";
+    Path segmentsDir = new Path(segments_dir);
+    long curTime = System.currentTimeMillis();
+    long topN = Long.MAX_VALUE;
+    int numFetchers = -1;
+    boolean filter = true;
+    boolean norm = true;
+    boolean force = false;
+    int maxNumSegments = 1;
+
+
+    if (args.containsKey("topN")) {
+      topN = Long.parseLong(args.get("topN"));
+    }
+    if (args.containsKey("numFetchers")) {
+      numFetchers = Integer.parseInt(args.get("numFetchers"));
+    }
+    if (args.containsKey("adddays")) {
+      long numDays = Integer.parseInt(args.get("adddays"));
+      curTime += numDays * 1000L * 60 * 60 * 24;
+    }
+    if (args.containsKey("noFilter")) {
+      filter = false;
+    } 
+    if (args.containsKey("noNorm")) {
+      norm = false;
+    } 
+    if (args.containsKey("force")) {
+      force = true;
+    } 
+    if (args.containsKey("maxNumSegments")) {
+      maxNumSegments = Integer.parseInt(args.get("maxNumSegments"));
+    }
+
+    try {
+      Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime,
+          filter, norm, force, maxNumSegments);
+      if (segs == null){
+        results.put(RESULT, Integer.toString(1));
+        return results;
+      }
+
+    } catch (Exception e) {
+      LOG.error("Generator: " + StringUtils.stringifyException(e));
+      results.put(RESULT, Integer.toString(-1));
+      return results;
+    }
+    results.put(RESULT, Integer.toString(0));
+    return results;
+  }
 }
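
For illustration, a minimal sketch (not part of this commit) of passing the
numeric arguments parsed above to Generator.run(); the wrapper class, the
values, and the "crawl" directory are hypothetical:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.nutch.crawl.Generator;
    import org.apache.nutch.util.NutchConfiguration;

    public class GeneratorRunSketch {
      public static void main(String[] cli) throws Exception {
        Map<String, String> args = new HashMap<String, String>();
        args.put("topN", "1000");         // cap the number of generated URLs
        args.put("adddays", "7");         // advance the current time by 7 days
        args.put("maxNumSegments", "2");  // allow up to 2 segments
        Generator generator = new Generator();
        generator.setConf(NutchConfiguration.create());
        Map<String, Object> result = generator.run(args, "crawl");
        // "0" on success, "1" when no segments were generated, "-1" on error
        System.out.println(result.get("result"));
      }
    }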

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Apr 22 01:46:28 2015
@@ -24,19 +24,18 @@ import java.util.*;
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
-
 import org.apache.nutch.net.*;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
 
 /**
@@ -53,7 +52,7 @@ import org.apache.nutch.util.TimingUtil;
  * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000
  * \t userType=open_source
  **/
-public class Injector extends Configured implements Tool {
+public class Injector extends NutchTool implements Tool {
   public static final Logger LOG = LoggerFactory.getLogger(Injector.class);
 
   /** metadata key reserved for setting a custom score for a specific URL */
@@ -385,4 +384,23 @@ public class Injector extends Configured
     }
   }
 
+  /**
+   * Used by the Nutch REST service
+   */
+  @Override
+  public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+    if(args.size()<1){
+      throw new IllegalArgumentException("Required arguments <url_dir>");
+    }
+    Map<String, Object> results = new HashMap<String, Object>();
+    String RESULT = "result";
+    String crawldb = crawlId+"/crawldb";
+    String url_dir = args.get("url_dir");
+
+    inject(new Path(crawldb), new Path(url_dir));
+    results.put(RESULT, Integer.toString(0));
+    return results;
+
+  }
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Wed Apr 22 01:46:28 2015
@@ -25,14 +25,12 @@ import java.net.*;
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
-
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.*;
@@ -40,10 +38,11 @@ import org.apache.nutch.util.HadoopFSUti
 import org.apache.nutch.util.LockUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
 
 /** Maintains an inverted link map, listing incoming links for each url. */
-public class LinkDb extends Configured implements Tool,
+public class LinkDb extends NutchTool implements Tool,
     Mapper<Text, ParseData, Text, Inlinks> {
 
   public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
@@ -338,4 +337,53 @@ public class LinkDb extends Configured i
     }
   }
 
-}
+  /*
+   * Used for Nutch REST service
+   */
+  @Override
+  public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+//    if (args.size() < 2) {
+//      throw new IllegalArgumentException("Required arguments <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
+//    }
+    
+    Map<String, Object> results = new HashMap<String, Object>();
+    String RESULT = "result";
+    String linkdb = crawlId + "/linkdb";
+    Path db = new Path(linkdb);
+    ArrayList<Path> segs = new ArrayList<Path>();
+    boolean filter = true;
+    boolean normalize = true;
+    boolean force = false;
+    if (args.containsKey("noNormalize")) {
+      normalize = false;
+    } 
+    if (args.containsKey("noFilter")) {
+      filter = false;
+    } 
+    if (args.containsKey("force")) {
+      force = true;
+    }
+    String segment_dir = crawlId+"/segments";
+    File segmentsDir = new File(segment_dir);
+    File[] segmentsList = segmentsDir.listFiles();  
+    // newest segment first (descending lastModified)
+    Arrays.sort(segmentsList, new Comparator<File>(){
+      @Override
+      public int compare(File f1, File f2) {
+        return Long.compare(f2.lastModified(), f1.lastModified());
+      }
+    });
+    segs.add(new Path(segmentsList[0].getPath()));
+    try {
+      invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force);
+      results.put(RESULT, Integer.toString(0));
+      return results;
+    } catch (Exception e) {
+      LOG.error("LinkDb: " + StringUtils.stringifyException(e));
+      results.put(RESULT, Integer.toString(-1));
+      return results;
+    }
+  }
+}
\ No newline at end of file

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Apr 22 01:46:28 2015
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.fetcher;
 
+import java.io.File;
 import java.io.IOException;
 import java.net.InetAddress;
 import java.net.MalformedURLException;
@@ -27,10 +28,18 @@ import java.util.Map.Entry;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 
 // Slf4j Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
@@ -38,7 +47,6 @@ import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
-
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.crawl.SignatureFactory;
@@ -91,7 +99,7 @@ import crawlercommons.robots.BaseRobotRu
  * 
  * @author Andrzej Bialecki
  */
-public class Fetcher extends Configured implements Tool,
+public class Fetcher extends NutchTool implements Tool,
     MapRunnable<Text, CrawlDatum, Text, NutchWritable> {
 
   public static final int PERM_REFRESH_TIME = 5;
@@ -1191,7 +1199,7 @@ public class Fetcher extends Configured
   }
 
   public Fetcher() {
-    super(null);
+	  super(null);
   }
 
   public Fetcher(Configuration conf) {
@@ -1618,4 +1626,44 @@ public class Fetcher extends Configured
     }
   }
 
+  @Override
+  public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+    String RESULT = "result";
+    String segment_dir = crawlId+"/segments";
+    File segmentsDir = new File(segment_dir);
+    File[] segmentsList = segmentsDir.listFiles();  
+    // newest segment first (descending lastModified)
+    Arrays.sort(segmentsList, new Comparator<File>(){
+      @Override
+      public int compare(File f1, File f2) {
+        return Long.compare(f2.lastModified(), f1.lastModified());
+      }
+    });
+    
+    Path segment = new Path(segmentsList[0].getPath());
+
+    int threads = getConf().getInt("fetcher.threads.fetch", 10);
+    boolean parsing = false;
+
+    // parse command line
+    if (args.containsKey("threads")) { // found -threads option
+      threads = Integer.parseInt(args.get("threads"));
+    }
+    getConf().setInt("fetcher.threads.fetch", threads);
+
+    try {
+      fetch(segment, threads);
+      results.put(RESULT, Integer.toString(0));
+      return results;
+    } catch (Exception e) {
+      LOG.error("Fetcher: " + StringUtils.stringifyException(e));
+      results.put(RESULT, Integer.toString(-1));
+      return results;
+    }
+  }
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Wed Apr 22 01:46:28 2015
@@ -26,52 +26,58 @@ import org.apache.hadoop.io.Text;
  */
 public interface Nutch {
 
-  public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
+	public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
 
-  public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
+	public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
 
-  public static final String SIGNATURE_KEY = "nutch.content.digest";
+	public static final String SIGNATURE_KEY = "nutch.content.digest";
 
-  public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+	public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
 
-  public static final String SCORE_KEY = "nutch.crawl.score";
+	public static final String SCORE_KEY = "nutch.crawl.score";
 
-  public static final String GENERATE_TIME_KEY = "_ngt_";
+	public static final String GENERATE_TIME_KEY = "_ngt_";
 
-  public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
-      GENERATE_TIME_KEY);
+	public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
+			GENERATE_TIME_KEY);
 
-  public static final String PROTO_STATUS_KEY = "_pst_";
+	public static final String PROTO_STATUS_KEY = "_pst_";
 
-  public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
-      PROTO_STATUS_KEY);
+	public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
+			PROTO_STATUS_KEY);
 
-  public static final String FETCH_TIME_KEY = "_ftk_";
+	public static final String FETCH_TIME_KEY = "_ftk_";
 
-  public static final String FETCH_STATUS_KEY = "_fst_";
+	public static final String FETCH_STATUS_KEY = "_fst_";
 
-  /**
-   * Sites may request that search engines don't provide access to cached
-   * documents.
-   */
-  public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
+	/**
+	 * Sites may request that search engines don't provide access to cached
+	 * documents.
+	 */
+	public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
 
-  /** Show both original forbidden content and summaries (default). */
-  public static final String CACHING_FORBIDDEN_NONE = "none";
+	/** Show both original forbidden content and summaries (default). */
+	public static final String CACHING_FORBIDDEN_NONE = "none";
 
-  /** Don't show either original forbidden content or summaries. */
-  public static final String CACHING_FORBIDDEN_ALL = "all";
+	/** Don't show either original forbidden content or summaries. */
+	public static final String CACHING_FORBIDDEN_ALL = "all";
 
-  /** Don't show original forbidden content, but show summaries. */
-  public static final String CACHING_FORBIDDEN_CONTENT = "content";
+	/** Don't show original forbidden content, but show summaries. */
+	public static final String CACHING_FORBIDDEN_CONTENT = "content";
 
-  public static final String REPR_URL_KEY = "_repr_";
+	public static final String REPR_URL_KEY = "_repr_";
 
-  public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
+	public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
 
-  /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
-  public static final String FIXED_INTERVAL_KEY = "fixedInterval";
+	/** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
+	public static final String FIXED_INTERVAL_KEY = "fixedInterval";
 
-  public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
-      FIXED_INTERVAL_KEY);
+	public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
+			FIXED_INTERVAL_KEY);
+
+	
+	/** Job progress key. Used by the Nutch REST service */
+	public static final String STAT_PROGRESS = "progress";
+	/** Used by the Nutch REST service */
+	public static final String CRAWL_ID_KEY = "storage.crawl.id";
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Apr 22 01:46:28 2015
@@ -19,10 +19,8 @@ package org.apache.nutch.parse;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.segment.SegmentChecker;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
@@ -33,7 +31,6 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.util.*;
 import org.apache.hadoop.fs.Path;
 
@@ -43,7 +40,7 @@ import java.util.*;
 import java.util.Map.Entry;
 
 /* Parse content in a segment. */
-public class ParseSegment extends Configured implements Tool,
+public class ParseSegment extends NutchTool implements Tool,
     Mapper<WritableComparable<?>, Content, Text, ParseImpl>,
     Reducer<Text, Writable, Text, Writable> {
 
@@ -200,11 +197,6 @@ public class ParseSegment extends Config
   }
 
   public void parse(Path segment) throws IOException {
-	if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
-	  LOG.warn("Segment: " + segment
-			  + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854
-	  return;
-	}
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
@@ -265,4 +257,37 @@ public class ParseSegment extends Config
     parse(segment);
     return 0;
   }
+
+  /*
+   * Used for Nutch REST service
+   */
+  public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+
+    Map<String, Object> results = new HashMap<String, Object>();
+    String RESULT = "result";
+    if (args.containsKey("nofilter")) {
+      getConf().setBoolean("parse.filter.urls", false);
+    }
+    if (args.containsKey("nonormalize")) {
+      getConf().setBoolean("parse.normalize.urls", false);
+    }
+
+    String segment_dir = crawlId+"/segments";
+    File segmentsDir = new File(segment_dir);
+    File[] segmentsList = segmentsDir.listFiles();  
+    // newest segment first (descending lastModified)
+    Arrays.sort(segmentsList, new Comparator<File>(){
+      @Override
+      public int compare(File f1, File f2) {
+        return Long.compare(f2.lastModified(), f1.lastModified());
+      }
+    });
+    
+    Path segment = new Path(segmentsList[0].getPath());
+    parse(segment);
+    results.put(RESULT, Integer.toString(0));
+    return results;
+  }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Wed Apr 22 01:46:28 2015
@@ -20,15 +20,10 @@ package org.apache.nutch.protocol;
 // JDK imports
 import java.io.File;
 import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStream;
 import java.io.LineNumberReader;
-import java.net.MalformedURLException;
 import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
+import java.util.ArrayList;
 import java.util.Hashtable;
-import java.util.Set;
 import java.util.StringTokenizer;
 
 // Commons Logging imports
@@ -37,11 +32,10 @@ import org.slf4j.LoggerFactory;
 
 // Nutch imports
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.util.NutchConfiguration;
+
+import com.google.common.io.Files;
 
 import crawlercommons.robots.BaseRobotRules;
 import crawlercommons.robots.SimpleRobotRules;
@@ -52,11 +46,8 @@ import crawlercommons.robots.SimpleRobot
  * This class uses crawler-commons for handling the parsing of
  * {@code robots.txt} files. It emits SimpleRobotRules objects, which describe
  * the download permissions as described in SimpleRobotRulesParser.
- * 
- * Protocol-specific implementations have to implement the method
- * {@link getRobotRulesSet}.
  */
-public abstract class RobotRulesParser implements Tool {
+public abstract class RobotRulesParser implements Configurable {
 
   public static final Logger LOG = LoggerFactory
       .getLogger(RobotRulesParser.class);
@@ -79,13 +70,9 @@ public abstract class RobotRulesParser i
       RobotRulesMode.ALLOW_NONE);
 
   private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
-  protected Configuration conf;
+  private Configuration conf;
   protected String agentNames;
 
-  /** set of host names or IPs to be explicitly excluded from robots.txt checking */
-  protected Set<String> whiteList = new HashSet<String>();;
-
-
   public RobotRulesParser() {
   }
 
@@ -125,12 +112,6 @@ public abstract class RobotRulesParser i
 
       agentNames = sb.toString();
     }
-
-    String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
-    if (confWhiteList != null && confWhiteList.length > 0) {
-      whiteList.addAll(Arrays.asList(confWhiteList));
-      LOG.info("Whitelisted hosts: " + whiteList);
-    }
   }
 
   /**
@@ -140,14 +121,6 @@ public abstract class RobotRulesParser i
     return conf;
   }
 
-
-  /**
-   * Check whether a URL belongs to a whitelisted host.
-   */
-  public boolean isWhiteListed(URL url) {
-    return whiteList.contains(url.getHost());
-  }
-
   /**
    * Parses the robots content using the {@link SimpleRobotRulesParser} from
    * crawler commons
@@ -178,127 +151,41 @@ public abstract class RobotRulesParser i
     return getRobotRulesSet(protocol, u);
   }
 
-  /**
-   * Fetch robots.txt (or it's protocol-specific equivalent) which applies to
-   * the given URL, parse it and return the set of robot rules applicable for
-   * the configured agent name(s).
-   * 
-   * @param protocol
-   *          protocol implementation
-   * @param url
-   *          URL to be checked whether fetching is allowed by robot rules
-   * @return robot rules
-   */
   public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
 
-  @Override
-  public int run(String[] args) {
+  /** command-line main for testing */
+  public static void main(String[] argv) {
 
-    if (args.length < 2) {
-      String[] help = {
-          "Usage: RobotRulesParser <robots-file> <url-file> [<agent-names>]\n",
-          "\tThe <robots-file> will be parsed as a robots.txt file,",
-          "\tusing the given <agent-name> to select rules.",
-          "\tURLs will be read (one per line) from <url-file>,",
-          "\tand tested against the rules.",
-          "\tMultiple agent names can be provided using",
-          "\tcomma as a delimiter without any spaces.",
-          "\tIf no agent name is given the property http.agent.name",
-          "\tis used. If http.agent.name is empty, robots.txt is checked",
-          "\tfor rules assigned to the user agent `*' (meaning any other)." };
-      for (String s : help) {
-        System.err.println(s);
-      }
+    if (argv.length != 3) {
+      System.err
+          .println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
+      System.err
+          .println("\tThe <robots-file> will be parsed as a robots.txt file,");
+      System.err
+          .println("\tusing the given <agent-name> to select rules.  URLs ");
+      System.err
+          .println("\twill be read (one per line) from <url-file>, and tested");
+      System.err
+          .println("\tagainst the rules. Multiple agent names can be provided using");
+      System.err.println("\tcomma as a delimiter without any spaces.");
       System.exit(-1);
     }
 
-    File robotsFile = new File(args[0]);
-    File urlFile = new File(args[1]);
-
-    if (args.length > 2) {
-      // set agent name from command-line in configuration and update parser
-      String agents = args[2];
-      conf.set("http.agent.name", agents);
-      setConf(conf);
-    }
-
     try {
-      BaseRobotRules rules = getRobotRulesSet(null, robotsFile.toURI().toURL());
+      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
+      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
+          "text/plain", argv[2]);
 
-      LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
-      String testPath;
-      testPath = testsIn.readLine().trim();
+      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+      String testPath = testsIn.readLine().trim();
       while (testPath != null) {
-        try {
-          // testPath can be just a path or a complete URL
-          URL url = new URL(testPath);
-          String status;
-          if (isWhiteListed(url)) {
-            status = "whitelisted";
-          } else if (rules.isAllowed(testPath)) {
-            status = "allowed";
-          } else {
-            status = "not allowed";
-          }
-          System.out.println(status + ":\t" + testPath);
-        } catch (MalformedURLException e) {
-        }
+        System.out.println((rules.isAllowed(testPath) ? "allowed"
+            : "not allowed") + ":\t" + testPath);
         testPath = testsIn.readLine();
       }
       testsIn.close();
-    } catch (IOException e) {
-      LOG.error("Failed to run: " + StringUtils.stringifyException(e));
-      return -1;
-    }
-
-    return 0;
-  }
-
-  /**
-   * {@link RobotRulesParser} implementation which expects the location of the
-   * robots.txt passed by URL (usually pointing to a local file) in
-   * {@link getRobotRulesSet}.
-   */
-  private static class TestRobotRulesParser extends RobotRulesParser {
-
-    public TestRobotRulesParser(Configuration conf) {
-      // make sure that agent name is set so that setConf() does not complain,
-      // the agent name is later overwritten by command-line argument
-      if (conf.get("http.agent.name") == null) {
-        conf.set("http.agent.name", "*");
-      }
-      setConf(conf);
-    }
-
-    /**
-     * @param protocol  (ignored)
-     * @param url
-     *          location of the robots.txt file
-     * */
-    public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url) {
-      BaseRobotRules rules;
-      try {
-        int contentLength = url.openConnection().getContentLength();
-        byte[] robotsBytes = new byte[contentLength];
-        InputStream openStream = url.openStream();
-        openStream.read(robotsBytes);
-        openStream.close();
-        rules = robotParser.parseContent(url.toString(), robotsBytes,
-            "text/plain", this.conf.get("http.agent.name"));
-      } catch (IOException e) {
-        LOG.error("Failed to open robots.txt file " + url
-            + StringUtils.stringifyException(e));
-        rules = EMPTY_RULES;
-      }
-      return rules;
+    } catch (Exception e) {
+      e.printStackTrace();
     }
-
-  }
-
-  public static void main(String[] args) throws Exception {
-    Configuration conf = NutchConfiguration.create();
-    int res = ToolRunner.run(conf, new TestRobotRulesParser(conf), args);
-    System.exit(res);
   }
-
 }
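
As a usage note matching the restored usage message above, the command-line
tester now takes exactly three arguments; the file names and agent list here
are illustrative:

    java org.apache.nutch.protocol.RobotRulesParser robots.txt urls.txt Nutch-Bot,AnotherBot

Each URL read from urls.txt is printed as "allowed" or "not allowed" against
the rules parsed from robots.txt for the given agent names.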

Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Wed Apr 22 01:46:28 2015
@@ -115,16 +115,4 @@ public class SegmentChecker {
     }
   }
 
-  /**
-   * Check the segment to see if it is has been parsed before.
-   */
-  public static boolean isParsed(Path segment, FileSystem fs)
-	  throws IOException {
-
-	if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME)))
-	  return true;
-	return false;
-
-  } 
-
-}
+}
\ No newline at end of file

Modified: nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java Wed Apr 22 01:46:28 2015
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
- 
+
 package org.apache.nutch.service;
 
 import java.util.Map;
@@ -30,10 +30,10 @@ public interface ConfManager {
   public Map<String, String> getAsMap(String confId);
 
   public void setProperty(String confId, String propName, String propValue);
-  
+
   public Set<String> list();
 
   public String create(NutchConfig nutchConfig);
-  
+
   public void delete(String confId);
 }

Modified: nutch/trunk/src/java/org/apache/nutch/service/JobManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/JobManager.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/JobManager.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/JobManager.java Wed Apr 22 01:46:28 2015
@@ -14,27 +14,33 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
- 
+
 package org.apache.nutch.service;
 
 import java.util.Collection;
+import java.util.Map;
 
-import org.apache.nutch.service.model.response.JobConfig;
+import org.apache.nutch.service.model.request.JobConfig;
 import org.apache.nutch.service.model.response.JobInfo;
 import org.apache.nutch.service.model.response.JobInfo.State;
 
 public interface JobManager {
-	
-	public static enum JobType{
-		INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS
-	};
-	public Collection<JobInfo> list(String crawlId, State state);
-
-	public JobInfo get(String crawlId, String id);
-
-	public String create(JobConfig jobConfig);
-	
-	public boolean abort(String crawlId, String id);
 
-	public boolean stop(String crawlId, String id);
+  public static enum JobType{
+    INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS, INVERTLINKS, DEDUP
+  };
+  public Collection<JobInfo> list(String crawlId, State state);
+
+  public JobInfo get(String crawlId, String id);
+
+  /**
+   * Creates the specified job.
+   * @param jobConfig the configuration of the job to create
+   * @return the JobInfo of the newly created job
+   */
+  public JobInfo create(JobConfig jobConfig);
+
+  public boolean abort(String crawlId, String id);
+
+  public boolean stop(String crawlId, String id);
 }

Modified: nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java Wed Apr 22 01:46:28 2015
@@ -20,6 +20,8 @@ package org.apache.nutch.service;
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.TimeUnit;
 
 import com.fasterxml.jackson.jaxrs.json.JacksonJaxbJsonProvider;
 import org.apache.commons.cli.CommandLineParser;
@@ -35,132 +37,154 @@ import org.apache.cxf.jaxrs.JAXRSServerF
 import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.apache.nutch.service.impl.ConfManagerImpl;
+import org.apache.nutch.service.impl.JobFactory;
+import org.apache.nutch.service.impl.JobManagerImpl;
+import org.apache.nutch.service.impl.NutchServerPoolExecutor;
 import org.apache.nutch.service.resources.ConfigResource;
+import org.apache.nutch.service.resources.DbResource;
 import org.apache.nutch.service.resources.JobResource;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Queues;
+
 public class NutchServer {
 
-	private static final Logger LOG = LoggerFactory.getLogger(NutchServer.class);
+  private static final Logger LOG = LoggerFactory.getLogger(NutchServer.class);
 
-	private static final String LOCALHOST = "localhost";
-	private static final Integer DEFAULT_PORT = 8081;
-	private static final int JOB_CAPACITY = 100;
+  private static final String LOCALHOST = "localhost";
+  private static final Integer DEFAULT_PORT = 8081;
+  private static final int JOB_CAPACITY = 100;
 
-	private static Integer port = DEFAULT_PORT;
+  private static Integer port = DEFAULT_PORT;
 	private static String host  = LOCALHOST;
 
-	private static final String CMD_HELP = "help";
-	private static final String CMD_PORT = "port";
+  private static final String CMD_HELP = "help";
+  private static final String CMD_PORT = "port";
 	private static final String CMD_HOST = "host";
 
-	private long started;
-	private boolean running;
-	private ConfManager configManager;
-	private JAXRSServerFactoryBean sf; 
-
-	private static NutchServer server;
-
-	static {
-		server = new NutchServer();
-	}
-
-	private NutchServer() {
-		configManager = new ConfManagerImpl();
-
-		sf = new JAXRSServerFactoryBean();
-		BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
-		JAXRSBindingFactory factory = new JAXRSBindingFactory();
-		factory.setBus(sf.getBus());
-		manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
-		sf.setResourceClasses(getClasses());
-		sf.setResourceProviders(getResourceProviders());
-		sf.setProvider(new JacksonJaxbJsonProvider());
-
-
-	}
-
-	public static NutchServer getInstance() {
-		return server;
-	}
+  private long started;
+  private boolean running;
+  private ConfManager configManager;
+  private JobManager jobManager;
+  private JAXRSServerFactoryBean sf; 
+
+  private static NutchServer server;
+
+  static {
+    server = new NutchServer();
+  }
+
+  private NutchServer() {
+    configManager = new ConfManagerImpl();
+    BlockingQueue<Runnable> runnables = Queues.newArrayBlockingQueue(JOB_CAPACITY);
+    NutchServerPoolExecutor executor = new NutchServerPoolExecutor(10, JOB_CAPACITY, 1, TimeUnit.HOURS, runnables);
+    jobManager = new JobManagerImpl(new JobFactory(), configManager, executor);
+
+    sf = new JAXRSServerFactoryBean();
+    BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
+    JAXRSBindingFactory factory = new JAXRSBindingFactory();
+    factory.setBus(sf.getBus());
+    manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
+    sf.setResourceClasses(getClasses());
+    sf.setResourceProviders(getResourceProviders());
+    sf.setProvider(new JacksonJaxbJsonProvider());
+
+
+  }
+
+  public static NutchServer getInstance() {
+    return server;
+  }
+
+  private static void startServer() {
+    server.start();
+  }
 
-	private static void startServer() {
-		server.start();
-	}
-
-	private void start() {
+  private void start() {
 		LOG.info("Starting NutchServer on {}:{}  ...", host, port);
-		try{
+    try{
 			String address = "http://" + host + ":" + port;
-			sf.setAddress(address);
-			sf.create();
-		}catch(Exception e){
-			throw new IllegalStateException("Server could not be started", e);
-		}
+      sf.setAddress(address);
+      sf.create();
+    }catch(Exception e){
+      throw new IllegalStateException("Server could not be started", e);
+    }
 
-		started = System.currentTimeMillis();
-		running = true;
+    started = System.currentTimeMillis();
+    running = true;
 		LOG.info("Started Nutch Server on {}:{} at {}", host, port, started);
 		System.out.println("Started Nutch Server on " + host + ":" + port + " at " + started);
-	}
+  }
 
-	public List<Class<?>> getClasses() {
-		List<Class<?>> resources = new ArrayList<Class<?>>();
-		resources.add(JobResource.class);
-		resources.add(ConfigResource.class);
-		return resources;
-	}
-
-	public List<ResourceProvider> getResourceProviders() {
-		List<ResourceProvider> resourceProviders = new ArrayList<ResourceProvider>();
-		resourceProviders.add(new SingletonResourceProvider(getConfManager()));
-
-		return resourceProviders;
-	}
-
-	public ConfManager getConfManager() {
-		return configManager;
-	}
-
-	public static void main(String[] args) throws ParseException {
-		CommandLineParser parser = new PosixParser();
-		Options options = createOptions();
-		CommandLine commandLine = parser.parse(options, args);
-		if (commandLine.hasOption(CMD_HELP)) {
-			HelpFormatter formatter = new HelpFormatter();
-			formatter.printHelp("NutchServer", options, true);
-			return;
-		}
-
-		if (commandLine.hasOption(CMD_PORT)) {
-			port = Integer.parseInt(commandLine.getOptionValue(CMD_PORT));
-		}
+  private List<Class<?>> getClasses() {
+    List<Class<?>> resources = new ArrayList<Class<?>>();
+    resources.add(JobResource.class);
+    resources.add(ConfigResource.class);
+    resources.add(DbResource.class);
+    return resources;
+  }
+
+  private List<ResourceProvider> getResourceProviders() {
+    List<ResourceProvider> resourceProviders = new ArrayList<ResourceProvider>();
+    resourceProviders.add(new SingletonResourceProvider(getConfManager()));
+    return resourceProviders;
+  }
+
+  public ConfManager getConfManager() {
+    return configManager;
+  }
+
+  public JobManager getJobManager() {
+    return jobManager;
+  }
+
+  public boolean isRunning(){
+    return running;
+  }
+
+  public long getStarted(){
+    return started;
+  }
+
+  public static void main(String[] args) throws ParseException {
+    CommandLineParser parser = new PosixParser();
+    Options options = createOptions();
+    CommandLine commandLine = parser.parse(options, args);
+    if (commandLine.hasOption(CMD_HELP)) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("NutchServer", options, true);
+      return;
+    }
+
+    if (commandLine.hasOption(CMD_PORT)) {
+      port = Integer.parseInt(commandLine.getOptionValue(CMD_PORT));
+    }
 
 		if (commandLine.hasOption(CMD_HOST)) {
 			host = commandLine.getOptionValue(CMD_HOST);
 		}
 
-		startServer();
-	}
+    startServer();
+  }
 
-	private static Options createOptions() {
-		Options options = new Options();
+  private static Options createOptions() {
+    Options options = new Options();
 
-		OptionBuilder.withDescription("Show this help");
-		options.addOption(OptionBuilder.create(CMD_HELP));
+    OptionBuilder.withDescription("Show this help");
+    options.addOption(OptionBuilder.create(CMD_HELP));
 
-		OptionBuilder.withArgName("port");
-		OptionBuilder.hasOptionalArg();
-		OptionBuilder.withDescription("The port to run the Nutch Server. Default port 8081");
-		options.addOption(OptionBuilder.create(CMD_PORT));
+    OptionBuilder.withArgName("port");
+    OptionBuilder.hasOptionalArg();
+    OptionBuilder.withDescription("The port to run the Nutch Server. Default port 8081");
+    options.addOption(OptionBuilder.create(CMD_PORT));
 
 		OptionBuilder.withArgName("host");
 		OptionBuilder.hasOptionalArg();
 		OptionBuilder.withDescription("The host to bind the Nutch Server to. Default is localhost.");
 		options.addOption(OptionBuilder.create(CMD_HOST));
 
-		return options;
-	}
+    return options;
+  }
 
 }

Modified: nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java Wed Apr 22 01:46:28 2015
@@ -34,99 +34,99 @@ import org.apache.nutch.util.NutchConfig
 import com.google.common.collect.Maps;
 
 public class ConfManagerImpl implements ConfManager {
-	
 
-	private Map<String, Configuration> configurations = Maps.newConcurrentMap();
 
-	private AtomicInteger newConfigId = new AtomicInteger();
+  private Map<String, Configuration> configurations = Maps.newConcurrentMap();
 
-	public ConfManagerImpl() {
-		configurations.put(ConfigResource.DEFAULT, NutchConfiguration.create());
-	}
-	
-	/**
-	 * Returns the configuration associatedConfManagerImpl with the given confId
-	 */
-	public Configuration get(String confId) {
-	    if (confId == null) {
-	      return configurations.get(ConfigResource.DEFAULT);
-	    }
-	    return configurations.get(confId);
-	  }
-
-	public Map<String, String> getAsMap(String confId) {
-	    Configuration configuration = configurations.get(confId);
-	    if (configuration == null) {
-	      return Collections.emptyMap();
-	    }
-
-	    Iterator<Entry<String, String>> iterator = configuration.iterator();
-	    Map<String, String> configMap = Maps.newTreeMap();
-	    while (iterator.hasNext()) {
-	      Entry<String, String> entry = iterator.next();
-	      configMap.put(entry.getKey(), entry.getValue());
-	    }
-	    return configMap;
-	  }
-	
-	/**
-	 * Sets the given property in the configuration associated with the confId
-	 */
-	public void setProperty(String confId, String propName, String propValue) {
-	    if (!configurations.containsKey(confId)) {
-	      throw new IllegalArgumentException("Unknown configId '" + confId + "'");
-	    }
-	    Configuration conf = configurations.get(confId);
-	    conf.set(propName, propValue);
-	}
-
-	public Set<String> list() {
-	    return configurations.keySet();
-	}
-
-	/**
-	 * Created a new configuration based on the values provided.
-	 * @param NutchConfig
-	 * @return String - confId
-	 */
-	public String create(NutchConfig nutchConfig) {
-	    if (StringUtils.isBlank(nutchConfig.getConfigId())) {
-	      nutchConfig.setConfigId(String.valueOf(newConfigId.incrementAndGet()));
-	    }
-
-	    if (!canCreate(nutchConfig)) {
-	      throw new IllegalArgumentException("Config already exists.");
-	    }
-
-	    createHadoopConfig(nutchConfig);
-	    return nutchConfig.getConfigId();
-	}
-
-	
-	public void delete(String confId) {
-	    configurations.remove(confId);
-	}
-	
-	private boolean canCreate(NutchConfig nutchConfig) {
-	    if (nutchConfig.isForce()) {
-	      return true;
-	    }
-	    if (!configurations.containsKey(nutchConfig.getConfigId())) {
-	      return true;
-	    }
-	    return false;
-	}
-	
-	private void createHadoopConfig(NutchConfig nutchConfig) {
-	    Configuration conf = NutchConfiguration.create();
-	    configurations.put(nutchConfig.getConfigId(), conf);
-
-	    if (MapUtils.isEmpty(nutchConfig.getParams())) {
-	      return;
-	    }
-	    for (Entry<String, String> e : nutchConfig.getParams().entrySet()) {
-	      conf.set(e.getKey(), e.getValue());
-	    }
-	}
+  private AtomicInteger newConfigId = new AtomicInteger();
+
+  public ConfManagerImpl() {
+    configurations.put(ConfigResource.DEFAULT, NutchConfiguration.create());
+  }
+
+  /**
+   * Returns the configuration associated with the given confId.
+   */
+  public Configuration get(String confId) {
+    if (confId == null) {
+      return configurations.get(ConfigResource.DEFAULT);
+    }
+    return configurations.get(confId);
+  }
+
+  public Map<String, String> getAsMap(String confId) {
+    Configuration configuration = configurations.get(confId);
+    if (configuration == null) {
+      return Collections.emptyMap();
+    }
+
+    Iterator<Entry<String, String>> iterator = configuration.iterator();
+    Map<String, String> configMap = Maps.newTreeMap();
+    while (iterator.hasNext()) {
+      Entry<String, String> entry = iterator.next();
+      configMap.put(entry.getKey(), entry.getValue());
+    }
+    return configMap;
+  }
+
+  /**
+   * Sets the given property in the configuration associated with the confId
+   */
+  public void setProperty(String confId, String propName, String propValue) {
+    if (!configurations.containsKey(confId)) {
+      throw new IllegalArgumentException("Unknown configId '" + confId + "'");
+    }
+    Configuration conf = configurations.get(confId);
+    conf.set(propName, propValue);
+  }
+
+  public Set<String> list() {
+    return configurations.keySet();
+  }
+
+  /**
+   * Creates a new configuration based on the values provided.
+   * @param nutchConfig the configuration values to apply
+   * @return the confId of the newly created configuration
+   */
+  public String create(NutchConfig nutchConfig) {
+    if (StringUtils.isBlank(nutchConfig.getConfigId())) {
+      nutchConfig.setConfigId(String.valueOf(newConfigId.incrementAndGet()));
+    }
+
+    if (!canCreate(nutchConfig)) {
+      throw new IllegalArgumentException("Config already exists.");
+    }
+
+    createHadoopConfig(nutchConfig);
+    return nutchConfig.getConfigId();
+  }
+
+
+  public void delete(String confId) {
+    configurations.remove(confId);
+  }
+
+  private boolean canCreate(NutchConfig nutchConfig) {
+    if (nutchConfig.isForce()) {
+      return true;
+    }
+    if (!configurations.containsKey(nutchConfig.getConfigId())) {
+      return true;
+    }
+    return false;
+  }
+
+  private void createHadoopConfig(NutchConfig nutchConfig) {
+    Configuration conf = NutchConfiguration.create();
+    configurations.put(nutchConfig.getConfigId(), conf);
+
+    if (MapUtils.isEmpty(nutchConfig.getParams())) {
+      return;
+    }
+    for (Entry<String, String> e : nutchConfig.getParams().entrySet()) {
+      conf.set(e.getKey(), e.getValue());
+    }
+  }
 
 }

Modified: nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java Wed Apr 22 01:46:28 2015
@@ -19,6 +19,7 @@ package org.apache.nutch.service.model.r
 import java.util.Map;
 
 import org.apache.nutch.service.JobManager.JobType;
+import org.apache.nutch.service.model.request.JobConfig;
 
 /**
  * This is the response object containing Job information
@@ -27,65 +28,75 @@ import org.apache.nutch.service.JobManag
  */
 public class JobInfo {
 
-	public static enum State {
-		IDLE, RUNNING, FINISHED, FAILED, KILLED, STOPPING, KILLING, ANY
-	};
-	
-	private String id;
-	private JobType type;
-	private String confId;
-	private Map<String, Object> args;
-	private Map<String, Object> result;
-	private State state;
-	private String msg;
-	private String crawlId;
-	
-	public String getId() {
-		return id;
-	}
-	public void setId(String id) {
-		this.id = id;
-	}
-	public JobType getType() {
-		return type;
-	}
-	public void setType(JobType type) {
-		this.type = type;
-	}
-	public String getConfId() {
-		return confId;
-	}
-	public void setConfId(String confId) {
-		this.confId = confId;
-	}
-	public Map<String, Object> getArgs() {
-		return args;
-	}
-	public void setArgs(Map<String, Object> args) {
-		this.args = args;
-	}
-	public Map<String, Object> getResult() {
-		return result;
-	}
-	public void setResult(Map<String, Object> result) {
-		this.result = result;
-	}
-	public State getState() {
-		return state;
-	}
-	public void setState(State state) {
-		this.state = state;
-	}
-	public String getMsg() {
-		return msg;
-	}
-	public void setMsg(String msg) {
-		this.msg = msg;
-	}
-	public String getCrawlId() {
-		return crawlId;
-	}
-	public void setCrawlId(String crawlId) {
-		this.crawlId = crawlId;
-	}
+  public static enum State {
+    IDLE, RUNNING, FINISHED, FAILED, KILLED, STOPPING, KILLING, ANY
+  };
+
+  private String id;
+  private JobType type;
+  private String confId;
+  private Map<String, String> args;
+  private Map<String, Object> result;
+  private State state;
+  private String msg;
+  private String crawlId;
+
+  public JobInfo(String generateId, JobConfig jobConfig, State state,
+      String msg) {
+    this.id = generateId;
+    this.type = jobConfig.getType();
+    this.confId = jobConfig.getConfId();
+    this.crawlId = jobConfig.getCrawlId();
+    this.args = jobConfig.getArgs();
+    this.msg = msg;
+    this.state = state;
+  }
+  public String getId() {
+    return id;
+  }
+  public void setId(String id) {
+    this.id = id;
+  }
+  public JobType getType() {
+    return type;
+  }
+  public void setType(JobType type) {
+    this.type = type;
+  }
+  public String getConfId() {
+    return confId;
+  }
+  public void setConfId(String confId) {
+    this.confId = confId;
+  }
+  public Map<String, String> getArgs() {
+    return args;
+  }
+  public void setArgs(Map<String, String> args) {
+    this.args = args;
+  }
+  public Map<String, Object> getResult() {
+    return result;
+  }
+  public void setResult(Map<String, Object> result) {
+    this.result = result;
+  }
+  public State getState() {
+    return state;
+  }
+  public void setState(State state) {
+    this.state = state;
+  }
+  public String getMsg() {
+    return msg;
+  }
+  public void setMsg(String msg) {
+    this.msg = msg;
+  }
+  public String getCrawlId() {
+    return crawlId;
+  }
+  public void setCrawlId(String crawlId) {
+    this.crawlId = crawlId;
+  }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java Wed Apr 22 01:46:28 2015
@@ -28,15 +28,16 @@ import org.apache.nutch.service.NutchSer
 
 @Produces(MediaType.APPLICATION_JSON)
 public abstract class AbstractResource {
-	
-	protected JobManager jobManager;
-	protected ConfManager configManager;
-	
-	public AbstractResource() {
-		configManager = NutchServer.getInstance().getConfManager();
-	}
-	
-	protected void throwBadRequestException(String message) {
-		throw new WebApplicationException(Response.status(Status.BAD_REQUEST).entity(message).build());
-	}
+
+  protected JobManager jobManager;
+  protected ConfManager configManager;
+
+  public AbstractResource() {
+    configManager = NutchServer.getInstance().getConfManager();
+    jobManager = NutchServer.getInstance().getJobManager();
+  }
+
+  protected void throwBadRequestException(String message) {
+    throw new WebApplicationException(Response.status(Status.BAD_REQUEST).entity(message).build());
+  }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java Wed Apr 22 01:46:28 2015
@@ -38,45 +38,45 @@ import com.fasterxml.jackson.databind.Se
 
 @Path("/config")
 public class ConfigResource extends AbstractResource{
-	
-	public static final String DEFAULT = "default";
 
-	@GET
-	@Path("/")
+  public static final String DEFAULT = "default";
+
+  @GET
+  @Path("/")
 	@JacksonFeatures(serializationEnable =  { SerializationFeature.INDENT_OUTPUT })
-	public Set<String> getConfigs() {
-		return configManager.list();
-	}
-	
-	@GET
-	@Path("/{configId}")
+  public Set<String> getConfigs() {
+    return configManager.list();
+  }
+
+  @GET
+  @Path("/{configId}")
 	@JacksonFeatures(serializationEnable =  { SerializationFeature.INDENT_OUTPUT })
-	public Map<String, String> getConfig(@PathParam("configId") String configId) {
-		return configManager.getAsMap(configId);
-	}
-	
-	@GET
-	@Path("/{configId}/{propertyId}")
+  public Map<String, String> getConfig(@PathParam("configId") String configId) {
+    return configManager.getAsMap(configId);
+  }
+
+  @GET
+  @Path("/{configId}/{propertyId}")
 	@JacksonFeatures(serializationEnable =  { SerializationFeature.INDENT_OUTPUT })
-	public String getProperty(@PathParam("configId") String configId,
-			@PathParam("propertyId") String propertyId) {
-		return configManager.getAsMap(configId).get(propertyId);
-	}
-
-	@DELETE
-	@Path("/{configId}")
-	public void deleteConfig(@PathParam("configId") String configId) {
-		configManager.delete(configId);
-	}
-
-	@POST
-	@Path("/{configId}")
-	@Consumes(MediaType.APPLICATION_JSON)
-	public String createConfig(NutchConfig newConfig) {
-		if (newConfig == null) {
-			throw new WebApplicationException(Response.status(Status.BAD_REQUEST)
-					.entity("Nutch configuration cannot be empty!").build());
-		}
-		return configManager.create(newConfig);
-	}
+  public String getProperty(@PathParam("configId") String configId,
+      @PathParam("propertyId") String propertyId) {
+    return configManager.getAsMap(configId).get(propertyId);
+  }
+
+  @DELETE
+  @Path("/{configId}")
+  public void deleteConfig(@PathParam("configId") String configId) {
+    configManager.delete(configId);
+  }
+
+  @POST
+  @Path("/{configId}")
+  @Consumes(MediaType.APPLICATION_JSON)
+  public String createConfig(NutchConfig newConfig) {
+    if (newConfig == null) {
+      throw new WebApplicationException(Response.status(Status.BAD_REQUEST)
+          .entity("Nutch configuration cannot be empty!").build());
+    }
+    return configManager.create(newConfig);
+  }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java Wed Apr 22 01:46:28 2015
@@ -28,7 +28,7 @@ import javax.ws.rs.core.MediaType;
 
 import com.fasterxml.jackson.databind.SerializationFeature;
 import com.fasterxml.jackson.jaxrs.annotation.JacksonFeatures;
-import org.apache.nutch.service.model.response.JobConfig;
+import org.apache.nutch.service.model.request.JobConfig;
 import org.apache.nutch.service.model.response.JobInfo;
 import org.apache.nutch.service.model.response.JobInfo.State;
 
@@ -67,10 +67,10 @@ public class JobResource extends Abstrac
   @POST
   @Path(value = "/create")
   @Consumes(MediaType.APPLICATION_JSON)
-  public String create(JobConfig config) {
+  public JobInfo create(JobConfig config) {
     if (config == null) {
       throwBadRequestException("Job configuration is required!");
-    }
-    return jobManager.create(config);
+    }   
+    return jobManager.create(config);   
   }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Wed Apr 22 01:46:28 2015
@@ -49,7 +49,6 @@ import org.apache.commons.compress.compr
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.FilenameUtils;
 
-import org.apache.commons.validator.routines.UrlValidator;
 //Hadoop
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -385,12 +384,6 @@ public class CommonCrawlDataDumper {
 					reader.getCurrentValue(content);
 					Metadata metadata = content.getMetadata();
 					String url = key.toString();
-					
-					UrlValidator urlValidator = new UrlValidator();
-					if (!urlValidator.isValid(url)) {
-						LOG.warn("Not valid URL detected: " + url);
-					}
-					
 					String baseName = FilenameUtils.getBaseName(url);
 					String extension = FilenameUtils.getExtension(url);
 					

Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Apr 22 01:46:28 2015
@@ -128,13 +128,9 @@ public class FileDumper {
    * @param mimeTypes
    *          an array of mime types we have to dump, all others will be
    *          filtered out.
-   * @param flatDir
-   *          a boolean flag specifying whether the output directory should contain
-   *          only files instead of using nested directories to prevent naming
-   *          conflicts.
    * @throws Exception
    */
-  public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir)
+  public void dump(File outputDir, File segmentRootDir, String[] mimeTypes)
       throws Exception {
     if (mimeTypes == null)
       LOG.info("Accepting all mimetypes.");
@@ -213,11 +209,7 @@ public class FileDumper {
 
           if (filter) {
             String md5Ofurl = DumpFileUtil.getUrlMD5(url);
-
-            String fullDir = outputDir.getAbsolutePath();
-            if (!flatDir) {
-                fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
-            }
+            String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
 
             if (!Strings.isNullOrEmpty(fullDir)) {
               String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
@@ -281,12 +273,6 @@ public class FileDumper {
         .withDescription(
             "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
         .create("mimetype");
-    @SuppressWarnings("static-access")
-    Option dirStructureOpt = OptionBuilder
-        .withArgName("flatdir")
-        .withDescription(
-            "optionally specify that the output directory should only contain files.")
-        .create("flatdir");
 
     // create the options
     Options options = new Options();
@@ -294,7 +280,6 @@ public class FileDumper {
     options.addOption(outputOpt);
     options.addOption(segOpt);
     options.addOption(mimeOpt);
-    options.addOption(dirStructureOpt);
 
     CommandLineParser parser = new GnuParser();
     try {
@@ -309,7 +294,6 @@ public class FileDumper {
       File outputDir = new File(line.getOptionValue("outputDir"));
       File segmentRootDir = new File(line.getOptionValue("segment"));
       String[] mimeTypes = line.getOptionValues("mimetype");
-      boolean flatDir = line.hasOption("flatdir");
 
       if (!outputDir.exists()) {
         LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
@@ -320,7 +304,7 @@ public class FileDumper {
       }
 
       FileDumper dumper = new FileDumper();
-      dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir);
+      dumper.dump(outputDir, segmentRootDir, mimeTypes);
     } catch (Exception e) {
       LOG.error("FileDumper: " + StringUtils.stringifyException(e));
       e.printStackTrace();

Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java Wed Apr 22 01:46:28 2015
@@ -67,20 +67,8 @@ public class DomainStatistics extends Co
 
   public int run(String[] args) throws Exception {
     if (args.length < 3) {
-      System.err.println("Usage: DomainStatistics inputDirs outDir mode [numOfReducer]");
-
-      System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
-      System.err.println("\t\t\tE.g.: crawl/crawldb/current/");
-
-      System.err.println("\toutDir\t\tOutput directory where results should be dumped");
-
-      System.err.println("\tmode\t\tSet statistics gathering mode");
-      System.err.println("\t\t\t\thost\tGather statistics by host");
-      System.err.println("\t\t\t\tdomain\tGather statistics by domain");
-      System.err.println("\t\t\t\tsuffix\tGather statistics by suffix");
-      System.err.println("\t\t\t\ttld\tGather statistics by top level directory");
-
-      System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+      System.out
+          .println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
       return 1;
     }
     String inputDir = args[0];