Posted to commits@nutch.apache.org by ma...@apache.org on 2015/04/22 03:46:28 UTC
svn commit: r1675243 [1/2] - in /nutch/trunk: ./ conf/ ivy/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/parse/
src/java/org/apache/nutch/protocol/ src/java/org/apache...
Author: mattmann
Date: Wed Apr 22 01:46:28 2015
New Revision: 1675243
URL: http://svn.apache.org/r1675243
Log:
Fix for NUTCH-1973 Job Administration end point for the REST service, contributed by Sujen Shah <su...@gmail.com>. This closes #16.
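For context, the patch wires job management into the REST server whose entry point is org.apache.nutch.service.NutchServer (diff below). A minimal sketch of launching it through its public main(), using only the options and defaults visible in the diff (the wrapper class is hypothetical):

    // Minimal sketch: start the REST server on the defaults shown in the diff
    // (localhost:8081); main() parses the -host and -port options.
    public class StartNutchServer {
      public static void main(String[] args) throws Exception {
        org.apache.nutch.service.NutchServer.main(
            new String[] { "-host", "localhost", "-port", "8081" });
      }
    }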
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/log4j.properties
nutch/trunk/conf/nutch-default.xml
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java
nutch/trunk/src/java/org/apache/nutch/service/JobManager.java
nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java
nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java
nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java
nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java
nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Apr 22 01:46:28 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1973 Job Administration end point for the REST service (Sujen Shah via mattmann)
+
* NUTCH-1697 SegmentMerger to implement Tool (markus, snagel)
* NUTCH-1987 - Make bin/crawl indexer agnostic (Michael Joyce, snagel via mattmann)
@@ -9,6 +11,20 @@ Nutch Current Development 1.10-SNAPSHOT
* NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel)
* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
+
+* NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)
+
+* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel)
+
+* NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via mattmann)
+
+* NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via mattmann)
+
+* NUTCH-1911 Improve DomainStatistics tool command line parsing (Michael Joyce via mattmann)
+
+* NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel)
+
+* NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
* NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)
Modified: nutch/trunk/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Wed Apr 22 01:46:28 2015
@@ -54,7 +54,6 @@ log4j.logger.org.apache.nutch.indexer.In
log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout
log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout
log4j.logger.org.apache.nutch.tools.CrawlDBScanner=INFO,cmdstdout
-log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout
log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN
log4j.logger.org.apache.nutch=INFO
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Apr 22 01:46:28 2015
@@ -118,15 +118,6 @@
</property>
<property>
- <name>http.robot.rules.whitelist</name>
- <value></value>
- <description>Comma separated list of hostnames or IP addresses to ignore
- robot rules parsing for. Use with care and only if you are explicitly
- allowed by the site owner to ignore the site's robots.txt!
- </description>
-</property>
-
-<property>
<name>http.robots.403.allow</name>
<value>true</value>
<description>Some servers return HTTP status 403 (Forbidden) if
@@ -1590,32 +1581,22 @@
<property>
<name>elastic.host</name>
- <value>localhost</value>
- <description>
- The hostname to send documents to using TransportClient. Either host
- and port must be defined or cluster to connect.
- </description>
+ <value></value>
+ <description>The hostname to send documents to using TransportClient. Either host
+ and port must be defined or cluster.</description>
</property>
<property>
<name>elastic.port</name>
- <value>9300</value>
- <description>
- The port to connect to using TransportClient. Note, the default port
- for HTTP is 9200. The TransportClient port is NOT the same. By default
- it should be 9300.
+ <value>9300</value>
+ <description>The port to connect to using TransportClient.
</description>
</property>
<property>
<name>elastic.cluster</name>
- <value>elasticsearch</value>
- <description>
- The cluster name to discover. Either host and port must be defined
- or cluster. The default Elasticsearch cluster name is 'elasticsearch'. If
- yours is different be sure to update this value even if you specified a host
- and port for connection otherwise you may encounter issues.
- </description>
+ <value></value>
+ <description>The cluster name to discover. Either host and port must be defined
+ or cluster.</description>
</property>
<property>
Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Apr 22 01:46:28 2015
@@ -43,8 +43,6 @@
<dependency org="commons-lang" name="commons-lang" rev="2.6"
conf="*->default" />
- <dependency org="commons-validator" name="commons-validator" rev="1.4.1"
- conf="*->default" />
<dependency org="commons-collections" name="commons-collections"
rev="3.1" conf="*->default" />
<dependency org="commons-httpclient" name="commons-httpclient"
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Apr 22 01:46:28 2015
@@ -24,24 +24,23 @@ import java.util.*;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
-
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
/**
* This class takes the output of the fetcher and updates the crawldb
* accordingly.
*/
-public class CrawlDb extends Configured implements Tool {
+public class CrawlDb extends NutchTool implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(CrawlDb.class);
public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
@@ -232,4 +231,61 @@ public class CrawlDb extends Configured
return -1;
}
}
+
+ /*
+ * Used for Nutch REST service
+ */
+ @Override
+ public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+
+ Map<String, Object> results = new HashMap<String, Object>();
+ String RESULT = "result";
+ boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING,
+ false);
+ boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
+ boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED,
+ true);
+ boolean force = false;
+ HashSet<Path> dirs = new HashSet<Path>();
+
+ if (args.containsKey("normalize")) {
+ normalize = true;
+ }
+ if (args.containsKey("filter")) {
+ filter = true;
+ }
+ if (args.containsKey("force")) {
+ force = true;
+ }
+ if (args.containsKey("noAdditions")) {
+ additionsAllowed = false;
+ }
+
+ String crawldb = crawlId+"/crawldb";
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+
+ dirs.add(new Path(segmentsList[0].getPath()));
+
+ try {
+ update(new Path(crawldb), dirs.toArray(new Path[dirs.size()]), normalize,
+ filter, additionsAllowed, force);
+ results.put(RESULT, Integer.toString(0));
+ return results;
+ } catch (Exception e) {
+ LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
+ results.put(RESULT, Integer.toString(-1));
+ return results;
+ }
+ }
}
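A note on the segment-selection code added above: the anonymous Comparator returns only -1 or 0, never a positive value, which breaks the Comparator contract (sgn(compare(a,b)) must equal -sgn(compare(b,a))) and can leave the sort order undefined. The same pattern recurs in the LinkDb, Fetcher, and ParseSegment hunks below. A contract-correct sketch (the helper class name is hypothetical):

    import java.io.File;
    import java.util.Arrays;
    import java.util.Comparator;

    class SegmentSort {
      /** Sorts newest-first so that segments[0] is the most recent segment. */
      static void newestFirst(File[] segments) {
        Arrays.sort(segments, new Comparator<File>() {
          @Override
          public int compare(File f1, File f2) {
            // Long.compare returns negative, zero, or positive as the contract requires.
            return Long.compare(f2.lastModified(), f1.lastModified());
          }
        });
      }
    }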
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Apr 22 01:46:28 2015
@@ -18,10 +18,12 @@
package org.apache.nutch.crawl;
import java.io.DataOutputStream;
+import java.io.File;
import java.io.IOException;
import java.io.Closeable;
import java.net.URL;
import java.util.Date;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
@@ -30,10 +32,20 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.TreeMap;
+
+
+
+
+
+
+
+
+
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
@@ -63,6 +75,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.StringUtil;
/**
@@ -345,70 +358,76 @@ public class CrawlDbReader extends Confi
closeReaders();
}
- public void processStatJob(String crawlDb, JobConf config, boolean sort)
+ private TreeMap<String, LongWritable> processStatJobHelper(String crawlDb, Configuration config, boolean sort) throws IOException{
+ Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
+
+ JobConf job = new NutchJob(config);
+ job.setJobName("stats " + crawlDb);
+ job.setBoolean("db.reader.stats.sort", sort);
+
+ FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
+ job.setInputFormat(SequenceFileInputFormat.class);
+
+ job.setMapperClass(CrawlDbStatMapper.class);
+ job.setCombinerClass(CrawlDbStatCombiner.class);
+ job.setReducerClass(CrawlDbStatReducer.class);
+
+ FileOutputFormat.setOutputPath(job, tmpFolder);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(LongWritable.class);
+
+ // https://issues.apache.org/jira/browse/NUTCH-1029
+ job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+ JobClient.runJob(job);
+
+ // reading the result
+ FileSystem fileSystem = FileSystem.get(config);
+ SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config,
+ tmpFolder);
+
+ Text key = new Text();
+ LongWritable value = new LongWritable();
+
+ TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
+ for (int i = 0; i < readers.length; i++) {
+ SequenceFile.Reader reader = readers[i];
+ while (reader.next(key, value)) {
+ String k = key.toString();
+ LongWritable val = stats.get(k);
+ if (val == null) {
+ val = new LongWritable();
+ if (k.equals("scx"))
+ val.set(Long.MIN_VALUE);
+ if (k.equals("scn"))
+ val.set(Long.MAX_VALUE);
+ stats.put(k, val);
+ }
+ if (k.equals("scx")) {
+ if (val.get() < value.get())
+ val.set(value.get());
+ } else if (k.equals("scn")) {
+ if (val.get() > value.get())
+ val.set(value.get());
+ } else {
+ val.set(val.get() + value.get());
+ }
+ }
+ reader.close();
+ }
+ // removing the tmp folder
+ fileSystem.delete(tmpFolder, true);
+ return stats;
+ }
+
+ public void processStatJob(String crawlDb, Configuration config, boolean sort)
throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb statistics start: " + crawlDb);
}
-
- Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());
-
- JobConf job = new NutchJob(config);
- job.setJobName("stats " + crawlDb);
- job.setBoolean("db.reader.stats.sort", sort);
-
- FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
- job.setInputFormat(SequenceFileInputFormat.class);
-
- job.setMapperClass(CrawlDbStatMapper.class);
- job.setCombinerClass(CrawlDbStatCombiner.class);
- job.setReducerClass(CrawlDbStatReducer.class);
-
- FileOutputFormat.setOutputPath(job, tmpFolder);
- job.setOutputFormat(SequenceFileOutputFormat.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(LongWritable.class);
-
- // https://issues.apache.org/jira/browse/NUTCH-1029
- job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
-
- JobClient.runJob(job);
-
- // reading the result
- FileSystem fileSystem = FileSystem.get(config);
- SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config,
- tmpFolder);
-
- Text key = new Text();
- LongWritable value = new LongWritable();
-
- TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
- for (int i = 0; i < readers.length; i++) {
- SequenceFile.Reader reader = readers[i];
- while (reader.next(key, value)) {
- String k = key.toString();
- LongWritable val = stats.get(k);
- if (val == null) {
- val = new LongWritable();
- if (k.equals("scx"))
- val.set(Long.MIN_VALUE);
- if (k.equals("scn"))
- val.set(Long.MAX_VALUE);
- stats.put(k, val);
- }
- if (k.equals("scx")) {
- if (val.get() < value.get())
- val.set(value.get());
- } else if (k.equals("scn")) {
- if (val.get() > value.get())
- val.set(value.get());
- } else {
- val.set(val.get() + value.get());
- }
- }
- reader.close();
- }
+ TreeMap<String, LongWritable> stats = processStatJobHelper(crawlDb, config, sort);
if (LOG.isInfoEnabled()) {
LOG.info("Statistics for CrawlDb: " + crawlDb);
@@ -437,8 +456,6 @@ public class CrawlDbReader extends Confi
LOG.info(k + ":\t" + val);
}
}
- // removing the tmp folder
- fileSystem.delete(tmpFolder, true);
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb statistics: done");
}
@@ -622,7 +639,7 @@ public class CrawlDbReader extends Confi
.println("\t<crawldb>\tdirectory name where crawldb is located");
System.err
.println("\t-stats [-sort] \tprint overall statistics to System.out");
- System.err.println("\t\t\tand optionally sort by host");
+ System.err.println("\t\t[-sort]\tlist status sorted by host");
System.err
.println("\t-dump <out_dir> [-format normal|csv|crawldb]\tdump the whole db to a text file in <out_dir>");
System.err.println("\t\t[-format csv]\tdump in Csv format");
@@ -702,5 +719,118 @@ public class CrawlDbReader extends Confi
int result = ToolRunner.run(NutchConfiguration.create(),
new CrawlDbReader(), args);
System.exit(result);
+ }
+ public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception {
+
+
+ Map<String, Object> results = new HashMap<String, Object>();
+ String crawlDb = crawlId + "/crawldb";
+
+ if(type.equalsIgnoreCase("stats")){
+ boolean sort = false;
+ if(args.containsKey("sort")){
+ if(args.get("sort").equalsIgnoreCase("true"))
+ sort = true;
+ }
+ TreeMap<String , LongWritable> stats = processStatJobHelper(crawlDb, NutchConfiguration.create(), sort);
+ LongWritable totalCnt = stats.get("T");
+ stats.remove("T");
+ results.put("totalUrls", String.valueOf(totalCnt.get()));
+ Map<String, Object> statusMap = new HashMap<String, Object>();
+
+ for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
+ String k = entry.getKey();
+ LongWritable val = entry.getValue();
+ if (k.equals("scn")) {
+
+ results.put("minScore", String.valueOf((val.get() / 1000.0f)));
+ } else if (k.equals("scx")) {
+ results.put("maxScore", String.valueOf((val.get() / 1000.0f)));
+ } else if (k.equals("sct")) {
+ results.put("avgScore", String.valueOf((float) ((((double) val.get()) / totalCnt.get()) / 1000.0)));
+ } else if (k.startsWith("status")) {
+ String[] st = k.split(" ");
+ int code = Integer.parseInt(st[1]);
+ if (st.length > 2){
+ Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap.get(String.valueOf(code));
+ Map<String, String> hostValues;
+ if(individualStatusInfo.containsKey("hostValues")){
+ hostValues= (Map<String, String>) individualStatusInfo.get("hostValues");
+ }
+ else{
+ hostValues = new HashMap<String, String>();
+ individualStatusInfo.put("hostValues", hostValues);
+ }
+ hostValues.put(st[2], String.valueOf(val));
+ }
+ else{
+ Map<String, Object> individualStatusInfo = new HashMap<String, Object>();
+
+ individualStatusInfo.put("statusValue", CrawlDatum.getStatusName((byte) code));
+ individualStatusInfo.put("count", String.valueOf(val));
+
+ statusMap.put(String.valueOf(code), individualStatusInfo);
+ }
+ } else
+ results.put(k, String.valueOf(val));
+ }
+ results.put("status", statusMap);
+ return results;
+ }
+ if(type.equalsIgnoreCase("dump")){
+ String output = args.get("out_dir");
+ String format = "normal";
+ String regex = null;
+ Integer retry = null;
+ String status = null;
+ if (args.containsKey("format")) {
+ format = args.get("format");
+ }
+ if (args.containsKey("regex")) {
+ regex = args.get("regex");
+ }
+ if (args.containsKey("retry")) {
+ retry = Integer.parseInt(args.get("retry"));
+ }
+ if (args.containsKey("status")) {
+ status = args.get("status");
+ }
+ processDumpJob(crawlDb, output, new NutchJob(conf), format, regex, status, retry);
+ File dumpFile = new File(output+"/part-00000");
+ return dumpFile;
+ }
+ if (type.equalsIgnoreCase("topN")) {
+ String output = args.get("out_dir");
+ long topN = Long.parseLong(args.get("nnn"));
+ float min = 0.0f;
+ if(args.containsKey("min")){
+ min = Float.parseFloat(args.get("min"));
+ }
+ processTopNJob(crawlDb, topN, min, output, new NutchJob(conf));
+ File dumpFile = new File(output+"/part-00000");
+ return dumpFile;
+ }
+
+ if(type.equalsIgnoreCase("url")){
+ String url = args.get("url");
+ CrawlDatum res = get(crawlDb, url, new NutchJob(conf));
+ results.put("status", res.getStatus());
+ results.put("fetchTime", new Date(res.getFetchTime()));
+ results.put("modifiedTime", new Date(res.getModifiedTime()));
+ results.put("retriesSinceFetch", res.getRetriesSinceFetch());
+ results.put("retryInterval", res.getFetchInterval());
+ results.put("score", res.getScore());
+ results.put("signature", StringUtil.toHexString(res.getSignature()));
+ Map<String, String> metadata = new HashMap<String, String>();
+ if(res.getMetaData()!=null){
+ for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
+ metadata.put(String.valueOf(e.getKey()), String.valueOf(e.getValue()));
+ }
+ }
+ results.put("metadata", metadata);
+
+ return results;
+ }
+ return results;
}
-}
+}
\ No newline at end of file
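The new query() method is the read-side entry point used by the REST service. A minimal sketch of a caller for the "stats" type, assuming a local crawl directory named "crawl" (the wrapper class and values are hypothetical):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.nutch.crawl.CrawlDbReader;
    import org.apache.nutch.util.NutchConfiguration;

    public class CrawlDbStatsExample {
      public static void main(String[] args) throws Exception {
        CrawlDbReader reader = new CrawlDbReader();
        Map<String, String> params = new HashMap<String, String>();
        params.put("sort", "true");  // optional; mirrors the -sort CLI flag
        // For "stats" the method runs processStatJobHelper() over <crawlId>/crawldb
        // and returns a Map with "totalUrls", "minScore", "maxScore", "avgScore"
        // and a nested "status" map keyed by CrawlDatum status code.
        Object stats = reader.query(params, NutchConfiguration.create(), "stats", "crawl");
        System.out.println(stats);
      }
    }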
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Wed Apr 22 01:46:28 2015
@@ -18,7 +18,9 @@ package org.apache.nutch.crawl;
import java.io.IOException;
import java.text.SimpleDateFormat;
+import java.util.HashMap;
import java.util.Iterator;
+import java.util.Map;
import java.util.Random;
import org.apache.hadoop.conf.Configured;
@@ -45,6 +47,7 @@ import org.apache.nutch.crawl.CrawlDatum
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -58,7 +61,7 @@ import org.slf4j.LoggerFactory;
* then the one with the shortest URL is kept. The documents marked as duplicate
* can then be deleted with the command CleaningJob.
***/
-public class DeduplicationJob extends Configured implements Tool {
+public class DeduplicationJob extends NutchTool implements Tool {
public static final Logger LOG = LoggerFactory
.getLogger(DeduplicationJob.class);
@@ -294,4 +297,19 @@ public class DeduplicationJob extends Co
new DeduplicationJob(), args);
System.exit(result);
}
+
+ @Override
+ public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+// if(args.size()<1){
+// throw new IllegalArgumentException("Required argument <crawldb>");
+// }
+ Map<String, Object> results = new HashMap<String, Object>();
+ String RESULT = "result";
+ String[] arg = new String[1];
+ String crawldb = crawlId+"/crawldb";
+ arg[0] = crawldb;
+ int res = run(arg);
+ results.put(RESULT, Integer.toString(res));
+ return results;
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Apr 22 01:46:28 2015
@@ -25,7 +25,6 @@ import java.text.*;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
@@ -34,7 +33,6 @@ import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
@@ -44,6 +42,7 @@ import org.apache.nutch.scoring.ScoringF
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
@@ -55,7 +54,7 @@ import org.apache.nutch.util.URLUtil;
* a segment. We can chose separately how to count the URLS i.e. by domain or
* host to limit the entries.
**/
-public class Generator extends Configured implements Tool {
+public class Generator extends NutchTool implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(Generator.class);
@@ -751,4 +750,63 @@ public class Generator extends Configure
}
return 0;
}
+
+ @Override
+ public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+
+
+ Map<String, Object> results = new HashMap<String, Object>();
+ String RESULT = "result";
+ String crawldb = crawlId+"/crawldb";
+ Path dbDir = new Path(crawldb);
+ String segments_dir = crawlId+"/segments";
+ Path segmentsDir = new Path(segments_dir);
+ long curTime = System.currentTimeMillis();
+ long topN = Long.MAX_VALUE;
+ int numFetchers = -1;
+ boolean filter = true;
+ boolean norm = true;
+ boolean force = false;
+ int maxNumSegments = 1;
+
+
+ if (args.containsKey("topN")) {
+ topN = Long.parseLong(args.get("topN"));
+ }
+ if (args.containsKey("numFetchers")) {
+ numFetchers = Integer.parseInt(args.get("numFetchers"));
+ }
+ if (args.containsKey("adddays")) {
+ long numDays = Integer.parseInt(args.get("adddays"));
+ curTime += numDays * 1000L * 60 * 60 * 24;
+ }
+ if (args.containsKey("noFilter")) {
+ filter = false;
+ }
+ if (args.containsKey("noNorm")) {
+ norm = false;
+ }
+ if (args.containsKey("force")) {
+ force = true;
+ }
+ if (args.containsKey("maxNumSegments")) {
+ maxNumSegments = Integer.parseInt(args.get("maxNumSegments"));
+ }
+
+ try {
+ Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime,
+ filter, norm, force, maxNumSegments);
+ if (segs == null){
+ results.put(RESULT, Integer.toString(1));
+ return results;
+ }
+
+ } catch (Exception e) {
+ LOG.error("Generator: " + StringUtils.stringifyException(e));
+ results.put(RESULT, Integer.toString(-1));
+ return results;
+ }
+ results.put(RESULT, Integer.toString(0));
+ return results;
+ }
}
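The argument keys recognized by Generator.run() mirror the CLI flags of the generate command. A sketch of how the REST layer might drive it, with the wrapper class and values assumed for illustration:

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.nutch.crawl.Generator;
    import org.apache.nutch.util.NutchConfiguration;

    public class GenerateJobExample {
      public static void main(String[] argv) throws Exception {
        Generator generator = new Generator();
        generator.setConf(NutchConfiguration.create());
        Map<String, String> args = new HashMap<String, String>();
        args.put("topN", "1000");   // parsed with Long.parseLong
        args.put("noFilter", "");   // only the presence of the key is checked
        Map<String, Object> outcome = generator.run(args, "crawl");
        // "result" is "0" on success, "1" if no segment was generated, "-1" on error
        System.out.println(outcome.get("result"));
      }
    }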
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Apr 22 01:46:28 2015
@@ -24,19 +24,18 @@ import java.util.*;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
-
import org.apache.nutch.net.*;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
/**
@@ -53,7 +52,7 @@ import org.apache.nutch.util.TimingUtil;
* e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000
* \t userType=open_source
**/
-public class Injector extends Configured implements Tool {
+public class Injector extends NutchTool implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(Injector.class);
/** metadata key reserved for setting a custom score for a specific URL */
@@ -385,4 +384,23 @@ public class Injector extends Configured
}
}
+ @Override
+ /**
+ * Used by the Nutch REST service
+ */
+ public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+ if(args.size()<1){
+ throw new IllegalArgumentException("Required arguments <url_dir>");
+ }
+ Map<String, Object> results = new HashMap<String, Object>();
+ String RESULT = "result";
+ String crawldb = crawlId+"/crawldb";
+ String url_dir = args.get("url_dir");
+
+ inject(new Path(crawldb), new Path(url_dir));
+ results.put(RESULT, Integer.toString(0));
+ return results;
+
+ }
+
}
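Injector.run() is the simplest of the new entry points: it requires a single "url_dir" argument and injects into <crawlId>/crawldb. A usage sketch under the same assumptions (directory names and wrapper class are hypothetical):

    import java.util.HashMap;
    import java.util.Map;
    import org.apache.nutch.crawl.Injector;
    import org.apache.nutch.util.NutchConfiguration;

    public class InjectJobExample {
      public static void main(String[] argv) throws Exception {
        Injector injector = new Injector();
        injector.setConf(NutchConfiguration.create());
        Map<String, String> args = new HashMap<String, String>();
        args.put("url_dir", "urls");  // omitting it raises IllegalArgumentException
        Map<String, Object> result = injector.run(args, "crawl");
        System.out.println(result.get("result"));  // "0" on success
      }
    }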
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Wed Apr 22 01:46:28 2015
@@ -25,14 +25,12 @@ import java.net.*;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
-
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.*;
@@ -40,10 +38,11 @@ import org.apache.nutch.util.HadoopFSUti
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
/** Maintains an inverted link map, listing incoming links for each url. */
-public class LinkDb extends Configured implements Tool,
+public class LinkDb extends NutchTool implements Tool,
Mapper<Text, ParseData, Text, Inlinks> {
public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
@@ -338,4 +337,53 @@ public class LinkDb extends Configured i
}
}
-}
+ /*
+ * Used for Nutch REST service
+ */
+ @Override
+ public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+// if (args.size() < 2) {
+// throw new IllegalArgumentException("Required arguments <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
+// }
+
+ Map<String, Object> results = new HashMap<String, Object>();
+ String RESULT = "result";
+ String linkdb = crawlId + "/linkdb";
+ Path db = new Path(linkdb);
+ ArrayList<Path> segs = new ArrayList<Path>();
+ boolean filter = true;
+ boolean normalize = true;
+ boolean force = false;
+ if (args.containsKey("noNormalize")) {
+ normalize = false;
+ }
+ if (args.containsKey("noFilter")) {
+ filter = false;
+ }
+ if (args.containsKey("force")) {
+ force = true;
+ }
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+ segs.add(new Path(segmentsList[0].getPath()));
+ try {
+ invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force);
+ results.put(RESULT, Integer.toString(0));
+ return results;
+ } catch (Exception e) {
+ LOG.error("LinkDb: " + StringUtils.stringifyException(e));
+ results.put(RESULT, Integer.toString(-1));
+ return results;
+ }
+ }
+}
\ No newline at end of file
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Apr 22 01:46:28 2015
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.fetcher;
+import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
@@ -27,10 +28,18 @@ import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
+
+
+
+
+
+
+
+
+
// Slf4j Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
@@ -38,7 +47,6 @@ import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
-
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
@@ -91,7 +99,7 @@ import crawlercommons.robots.BaseRobotRu
*
* @author Andrzej Bialecki
*/
-public class Fetcher extends Configured implements Tool,
+public class Fetcher extends NutchTool implements Tool,
MapRunnable<Text, CrawlDatum, Text, NutchWritable> {
public static final int PERM_REFRESH_TIME = 5;
@@ -1191,7 +1199,7 @@ public class Fetcher extends Configured
}
public Fetcher() {
- super(null);
+ super(null);
}
public Fetcher(Configuration conf) {
@@ -1618,4 +1626,44 @@ public class Fetcher extends Configured
}
}
+ @Override
+ public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+
+ Map<String, Object> results = new HashMap<String, Object>();
+ String RESULT = "result";
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+
+ Path segment = new Path(segmentsList[0].getPath());
+
+ int threads = getConf().getInt("fetcher.threads.fetch", 10);
+ boolean parsing = false;
+
+ // parse command line
+ if (args.containsKey("threads")) { // found -threads option
+ threads = Integer.parseInt(args.get("threads"));
+ }
+ getConf().setInt("fetcher.threads.fetch", threads);
+
+ try {
+ fetch(segment, threads);
+ results.put(RESULT, Integer.toString(0));
+ return results;
+ } catch (Exception e) {
+ LOG.error("Fetcher: " + StringUtils.stringifyException(e));
+ results.put(RESULT, Integer.toString(-1));
+ return results;
+ }
+ }
+
}
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Wed Apr 22 01:46:28 2015
@@ -26,52 +26,58 @@ import org.apache.hadoop.io.Text;
*/
public interface Nutch {
- public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
+ public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
- public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
+ public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
- public static final String SIGNATURE_KEY = "nutch.content.digest";
+ public static final String SIGNATURE_KEY = "nutch.content.digest";
- public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+ public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
- public static final String SCORE_KEY = "nutch.crawl.score";
+ public static final String SCORE_KEY = "nutch.crawl.score";
- public static final String GENERATE_TIME_KEY = "_ngt_";
+ public static final String GENERATE_TIME_KEY = "_ngt_";
- public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
- GENERATE_TIME_KEY);
+ public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
+ GENERATE_TIME_KEY);
- public static final String PROTO_STATUS_KEY = "_pst_";
+ public static final String PROTO_STATUS_KEY = "_pst_";
- public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
- PROTO_STATUS_KEY);
+ public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
+ PROTO_STATUS_KEY);
- public static final String FETCH_TIME_KEY = "_ftk_";
+ public static final String FETCH_TIME_KEY = "_ftk_";
- public static final String FETCH_STATUS_KEY = "_fst_";
+ public static final String FETCH_STATUS_KEY = "_fst_";
- /**
- * Sites may request that search engines don't provide access to cached
- * documents.
- */
- public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
+ /**
+ * Sites may request that search engines don't provide access to cached
+ * documents.
+ */
+ public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
- /** Show both original forbidden content and summaries (default). */
- public static final String CACHING_FORBIDDEN_NONE = "none";
+ /** Show both original forbidden content and summaries (default). */
+ public static final String CACHING_FORBIDDEN_NONE = "none";
- /** Don't show either original forbidden content or summaries. */
- public static final String CACHING_FORBIDDEN_ALL = "all";
+ /** Don't show either original forbidden content or summaries. */
+ public static final String CACHING_FORBIDDEN_ALL = "all";
- /** Don't show original forbidden content, but show summaries. */
- public static final String CACHING_FORBIDDEN_CONTENT = "content";
+ /** Don't show original forbidden content, but show summaries. */
+ public static final String CACHING_FORBIDDEN_CONTENT = "content";
- public static final String REPR_URL_KEY = "_repr_";
+ public static final String REPR_URL_KEY = "_repr_";
- public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
+ public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
- /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
- public static final String FIXED_INTERVAL_KEY = "fixedInterval";
+ /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
+ public static final String FIXED_INTERVAL_KEY = "fixedInterval";
- public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
- FIXED_INTERVAL_KEY);
+ public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
+ FIXED_INTERVAL_KEY);
+
+
+ /** For progress of job. Used by the Nutch REST service */
+ public static final String STAT_PROGRESS = "progress";
+ /**Used by Nutch REST service */
+ public static final String CRAWL_ID_KEY = "storage.crawl.id";
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Apr 22 01:46:28 2015
@@ -19,10 +19,8 @@ package org.apache.nutch.parse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.segment.SegmentChecker;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
@@ -33,7 +31,6 @@ import org.apache.nutch.net.protocols.Re
import org.apache.nutch.protocol.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.util.*;
import org.apache.hadoop.fs.Path;
@@ -43,7 +40,7 @@ import java.util.*;
import java.util.Map.Entry;
/* Parse content in a segment. */
-public class ParseSegment extends Configured implements Tool,
+public class ParseSegment extends NutchTool implements Tool,
Mapper<WritableComparable<?>, Content, Text, ParseImpl>,
Reducer<Text, Writable, Text, Writable> {
@@ -200,11 +197,6 @@ public class ParseSegment extends Config
}
public void parse(Path segment) throws IOException {
- if (SegmentChecker.isParsed(segment, FileSystem.get(getConf()))) {
- LOG.warn("Segment: " + segment
- + " already parsed!! Skipped parsing this segment!!"); // NUTCH-1854
- return;
- }
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -265,4 +257,37 @@ public class ParseSegment extends Config
parse(segment);
return 0;
}
+
+ /*
+ * Used for Nutch REST service
+ */
+ public Map<String, Object> run(Map<String, String> args, String crawlId) throws Exception {
+
+ Map<String, Object> results = new HashMap<String, Object>();
+ String RESULT = "result";
+ if (args.containsKey("nofilter")) {
+ getConf().setBoolean("parse.filter.urls", false);
+ }
+ if (args.containsKey("nonormalize")) {
+ getConf().setBoolean("parse.normalize.urls", false);
+ }
+
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+
+ Path segment = new Path(segmentsList[0].getPath());
+ parse(segment);
+ results.put(RESULT, Integer.toString(0));
+ return results;
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Wed Apr 22 01:46:28 2015
@@ -20,15 +20,10 @@ package org.apache.nutch.protocol;
// JDK imports
import java.io.File;
import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStream;
import java.io.LineNumberReader;
-import java.net.MalformedURLException;
import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
+import java.util.ArrayList;
import java.util.Hashtable;
-import java.util.Set;
import java.util.StringTokenizer;
// Commons Logging imports
@@ -37,11 +32,10 @@ import org.slf4j.LoggerFactory;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.util.NutchConfiguration;
+
+import com.google.common.io.Files;
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
@@ -52,11 +46,8 @@ import crawlercommons.robots.SimpleRobot
* This class uses crawler-commons for handling the parsing of
* {@code robots.txt} files. It emits SimpleRobotRules objects, which describe
* the download permissions as described in SimpleRobotRulesParser.
- *
- * Protocol-specific implementations have to implement the method
- * {@link getRobotRulesSet}.
*/
-public abstract class RobotRulesParser implements Tool {
+public abstract class RobotRulesParser implements Configurable {
public static final Logger LOG = LoggerFactory
.getLogger(RobotRulesParser.class);
@@ -79,13 +70,9 @@ public abstract class RobotRulesParser i
RobotRulesMode.ALLOW_NONE);
private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
- protected Configuration conf;
+ private Configuration conf;
protected String agentNames;
- /** set of host names or IPs to be explicitly excluded from robots.txt checking */
- protected Set<String> whiteList = new HashSet<String>();;
-
-
public RobotRulesParser() {
}
@@ -125,12 +112,6 @@ public abstract class RobotRulesParser i
agentNames = sb.toString();
}
-
- String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
- if (confWhiteList != null && confWhiteList.length > 0) {
- whiteList.addAll(Arrays.asList(confWhiteList));
- LOG.info("Whitelisted hosts: " + whiteList);
- }
}
/**
@@ -140,14 +121,6 @@ public abstract class RobotRulesParser i
return conf;
}
-
- /**
- * Check whether a URL belongs to a whitelisted host.
- */
- public boolean isWhiteListed(URL url) {
- return whiteList.contains(url.getHost());
- }
-
/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from
* crawler commons
@@ -178,127 +151,41 @@ public abstract class RobotRulesParser i
return getRobotRulesSet(protocol, u);
}
- /**
- * Fetch robots.txt (or it's protocol-specific equivalent) which applies to
- * the given URL, parse it and return the set of robot rules applicable for
- * the configured agent name(s).
- *
- * @param protocol
- * protocol implementation
- * @param url
- * URL to be checked whether fetching is allowed by robot rules
- * @return robot rules
- */
public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
- @Override
- public int run(String[] args) {
+ /** command-line main for testing */
+ public static void main(String[] argv) {
- if (args.length < 2) {
- String[] help = {
- "Usage: RobotRulesParser <robots-file> <url-file> [<agent-names>]\n",
- "\tThe <robots-file> will be parsed as a robots.txt file,",
- "\tusing the given <agent-name> to select rules.",
- "\tURLs will be read (one per line) from <url-file>,",
- "\tand tested against the rules.",
- "\tMultiple agent names can be provided using",
- "\tcomma as a delimiter without any spaces.",
- "\tIf no agent name is given the property http.agent.name",
- "\tis used. If http.agent.name is empty, robots.txt is checked",
- "\tfor rules assigned to the user agent `*' (meaning any other)." };
- for (String s : help) {
- System.err.println(s);
- }
+ if (argv.length != 3) {
+ System.err
+ .println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
+ System.err
+ .println("\tThe <robots-file> will be parsed as a robots.txt file,");
+ System.err
+ .println("\tusing the given <agent-name> to select rules. URLs ");
+ System.err
+ .println("\twill be read (one per line) from <url-file>, and tested");
+ System.err
+ .println("\tagainst the rules. Multiple agent names can be provided using");
+ System.err.println("\tcomma as a delimiter without any spaces.");
System.exit(-1);
}
- File robotsFile = new File(args[0]);
- File urlFile = new File(args[1]);
-
- if (args.length > 2) {
- // set agent name from command-line in configuration and update parser
- String agents = args[2];
- conf.set("http.agent.name", agents);
- setConf(conf);
- }
-
try {
- BaseRobotRules rules = getRobotRulesSet(null, robotsFile.toURI().toURL());
+ byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
+ BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
+ "text/plain", argv[2]);
- LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
- String testPath;
- testPath = testsIn.readLine().trim();
+ LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+ String testPath = testsIn.readLine().trim();
while (testPath != null) {
- try {
- // testPath can be just a path or a complete URL
- URL url = new URL(testPath);
- String status;
- if (isWhiteListed(url)) {
- status = "whitelisted";
- } else if (rules.isAllowed(testPath)) {
- status = "allowed";
- } else {
- status = "not allowed";
- }
- System.out.println(status + ":\t" + testPath);
- } catch (MalformedURLException e) {
- }
+ System.out.println((rules.isAllowed(testPath) ? "allowed"
+ : "not allowed") + ":\t" + testPath);
testPath = testsIn.readLine();
}
testsIn.close();
- } catch (IOException e) {
- LOG.error("Failed to run: " + StringUtils.stringifyException(e));
- return -1;
- }
-
- return 0;
- }
-
- /**
- * {@link RobotRulesParser} implementation which expects the location of the
- * robots.txt passed by URL (usually pointing to a local file) in
- * {@link getRobotRulesSet}.
- */
- private static class TestRobotRulesParser extends RobotRulesParser {
-
- public TestRobotRulesParser(Configuration conf) {
- // make sure that agent name is set so that setConf() does not complain,
- // the agent name is later overwritten by command-line argument
- if (conf.get("http.agent.name") == null) {
- conf.set("http.agent.name", "*");
- }
- setConf(conf);
- }
-
- /**
- * @param protocol (ignored)
- * @param url
- * location of the robots.txt file
- * */
- public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url) {
- BaseRobotRules rules;
- try {
- int contentLength = url.openConnection().getContentLength();
- byte[] robotsBytes = new byte[contentLength];
- InputStream openStream = url.openStream();
- openStream.read(robotsBytes);
- openStream.close();
- rules = robotParser.parseContent(url.toString(), robotsBytes,
- "text/plain", this.conf.get("http.agent.name"));
- } catch (IOException e) {
- LOG.error("Failed to open robots.txt file " + url
- + StringUtils.stringifyException(e));
- rules = EMPTY_RULES;
- }
- return rules;
+ } catch (Exception e) {
+ e.printStackTrace();
}
-
- }
-
- public static void main(String[] args) throws Exception {
- Configuration conf = NutchConfiguration.create();
- int res = ToolRunner.run(conf, new TestRobotRulesParser(conf), args);
- System.exit(res);
}
-
}
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentChecker.java Wed Apr 22 01:46:28 2015
@@ -115,16 +115,4 @@ public class SegmentChecker {
}
}
- /**
- * Check the segment to see if it is has been parsed before.
- */
- public static boolean isParsed(Path segment, FileSystem fs)
- throws IOException {
-
- if (fs.exists(new Path(segment, CrawlDatum.PARSE_DIR_NAME)))
- return true;
- return false;
-
- }
-
-}
+}
\ No newline at end of file
Modified: nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/ConfManager.java Wed Apr 22 01:46:28 2015
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+
package org.apache.nutch.service;
import java.util.Map;
@@ -30,10 +30,10 @@ public interface ConfManager {
public Map<String, String> getAsMap(String confId);
public void setProperty(String confId, String propName, String propValue);
-
+
public Set<String> list();
public String create(NutchConfig nutchConfig);
-
+
public void delete(String confId);
}
Modified: nutch/trunk/src/java/org/apache/nutch/service/JobManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/JobManager.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/JobManager.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/JobManager.java Wed Apr 22 01:46:28 2015
@@ -14,27 +14,33 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
+
package org.apache.nutch.service;
import java.util.Collection;
+import java.util.Map;
-import org.apache.nutch.service.model.response.JobConfig;
+import org.apache.nutch.service.model.request.JobConfig;
import org.apache.nutch.service.model.response.JobInfo;
import org.apache.nutch.service.model.response.JobInfo.State;
public interface JobManager {
-
- public static enum JobType{
- INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS
- };
- public Collection<JobInfo> list(String crawlId, State state);
-
- public JobInfo get(String crawlId, String id);
-
- public String create(JobConfig jobConfig);
-
- public boolean abort(String crawlId, String id);
- public boolean stop(String crawlId, String id);
+ public static enum JobType{
+ INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS, INVERTLINKS, DEDUP
+ };
+ public Collection<JobInfo> list(String crawlId, State state);
+
+ public JobInfo get(String crawlId, String id);
+
+ /**
+ * Creates specified job
+ * @param jobConfig
+ * @return JobInfo
+ */
+ public JobInfo create(JobConfig jobConfig);
+
+ public boolean abort(String crawlId, String id);
+
+ public boolean stop(String crawlId, String id);
}
Modified: nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/NutchServer.java Wed Apr 22 01:46:28 2015
@@ -20,6 +20,8 @@ package org.apache.nutch.service;
import java.util.ArrayList;
import java.util.List;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.TimeUnit;
import com.fasterxml.jackson.jaxrs.json.JacksonJaxbJsonProvider;
import org.apache.commons.cli.CommandLineParser;
@@ -35,132 +37,154 @@ import org.apache.cxf.jaxrs.JAXRSServerF
import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.apache.nutch.service.impl.ConfManagerImpl;
+import org.apache.nutch.service.impl.JobFactory;
+import org.apache.nutch.service.impl.JobManagerImpl;
+import org.apache.nutch.service.impl.NutchServerPoolExecutor;
import org.apache.nutch.service.resources.ConfigResource;
+import org.apache.nutch.service.resources.DbResource;
import org.apache.nutch.service.resources.JobResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.collect.Queues;
+
public class NutchServer {
- private static final Logger LOG = LoggerFactory.getLogger(NutchServer.class);
+ private static final Logger LOG = LoggerFactory.getLogger(NutchServer.class);
- private static final String LOCALHOST = "localhost";
- private static final Integer DEFAULT_PORT = 8081;
- private static final int JOB_CAPACITY = 100;
+ private static final String LOCALHOST = "localhost";
+ private static final Integer DEFAULT_PORT = 8081;
+ private static final int JOB_CAPACITY = 100;
- private static Integer port = DEFAULT_PORT;
+ private static Integer port = DEFAULT_PORT;
private static String host = LOCALHOST;
- private static final String CMD_HELP = "help";
- private static final String CMD_PORT = "port";
+ private static final String CMD_HELP = "help";
+ private static final String CMD_PORT = "port";
private static final String CMD_HOST = "host";
- private long started;
- private boolean running;
- private ConfManager configManager;
- private JAXRSServerFactoryBean sf;
-
- private static NutchServer server;
-
- static {
- server = new NutchServer();
- }
-
- private NutchServer() {
- configManager = new ConfManagerImpl();
-
- sf = new JAXRSServerFactoryBean();
- BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
- JAXRSBindingFactory factory = new JAXRSBindingFactory();
- factory.setBus(sf.getBus());
- manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
- sf.setResourceClasses(getClasses());
- sf.setResourceProviders(getResourceProviders());
- sf.setProvider(new JacksonJaxbJsonProvider());
-
-
- }
-
- public static NutchServer getInstance() {
- return server;
- }
+ private long started;
+ private boolean running;
+ private ConfManager configManager;
+ private JobManager jobManager;
+ private JAXRSServerFactoryBean sf;
+
+ private static NutchServer server;
+
+ static {
+ server = new NutchServer();
+ }
+
+ private NutchServer() {
+ configManager = new ConfManagerImpl();
+ BlockingQueue<Runnable> runnables = Queues.newArrayBlockingQueue(JOB_CAPACITY);
+ NutchServerPoolExecutor executor = new NutchServerPoolExecutor(10, JOB_CAPACITY, 1, TimeUnit.HOURS, runnables);
+ jobManager = new JobManagerImpl(new JobFactory(), configManager, executor);
+
+ sf = new JAXRSServerFactoryBean();
+ BindingFactoryManager manager = sf.getBus().getExtension(BindingFactoryManager.class);
+ JAXRSBindingFactory factory = new JAXRSBindingFactory();
+ factory.setBus(sf.getBus());
+ manager.registerBindingFactory(JAXRSBindingFactory.JAXRS_BINDING_ID, factory);
+ sf.setResourceClasses(getClasses());
+ sf.setResourceProviders(getResourceProviders());
+ sf.setProvider(new JacksonJaxbJsonProvider());
+
+
+ }
+
+ public static NutchServer getInstance() {
+ return server;
+ }
+
+ private static void startServer() {
+ server.start();
+ }
- private static void startServer() {
- server.start();
- }
-
- private void start() {
+ private void start() {
LOG.info("Starting NutchServer on {}:{} ...", host, port);
- try{
+ try{
String address = "http://" + host + ":" + port;
- sf.setAddress(address);
- sf.create();
- }catch(Exception e){
- throw new IllegalStateException("Server could not be started", e);
- }
+ sf.setAddress(address);
+ sf.create();
+ }catch(Exception e){
+ throw new IllegalStateException("Server could not be started", e);
+ }
- started = System.currentTimeMillis();
- running = true;
+ started = System.currentTimeMillis();
+ running = true;
LOG.info("Started Nutch Server on {}:{} at {}", host, port, started);
System.out.println("Started Nutch Server on " + host + ":" + port + " at " + started);
- }
+ }
- public List<Class<?>> getClasses() {
- List<Class<?>> resources = new ArrayList<Class<?>>();
- resources.add(JobResource.class);
- resources.add(ConfigResource.class);
- return resources;
- }
-
- public List<ResourceProvider> getResourceProviders() {
- List<ResourceProvider> resourceProviders = new ArrayList<ResourceProvider>();
- resourceProviders.add(new SingletonResourceProvider(getConfManager()));
-
- return resourceProviders;
- }
-
- public ConfManager getConfManager() {
- return configManager;
- }
-
- public static void main(String[] args) throws ParseException {
- CommandLineParser parser = new PosixParser();
- Options options = createOptions();
- CommandLine commandLine = parser.parse(options, args);
- if (commandLine.hasOption(CMD_HELP)) {
- HelpFormatter formatter = new HelpFormatter();
- formatter.printHelp("NutchServer", options, true);
- return;
- }
-
- if (commandLine.hasOption(CMD_PORT)) {
- port = Integer.parseInt(commandLine.getOptionValue(CMD_PORT));
- }
+ private List<Class<?>> getClasses() {
+ List<Class<?>> resources = new ArrayList<Class<?>>();
+ resources.add(JobResource.class);
+ resources.add(ConfigResource.class);
+ resources.add(DbResource.class);
+ return resources;
+ }
+
+ private List<ResourceProvider> getResourceProviders() {
+ List<ResourceProvider> resourceProviders = new ArrayList<ResourceProvider>();
+ resourceProviders.add(new SingletonResourceProvider(getConfManager()));
+ return resourceProviders;
+ }
+
+ public ConfManager getConfManager() {
+ return configManager;
+ }
+
+ public JobManager getJobManager() {
+ return jobManager;
+ }
+
+ public boolean isRunning(){
+ return running;
+ }
+
+ public long getStarted(){
+ return started;
+ }
+
+ public static void main(String[] args) throws ParseException {
+ CommandLineParser parser = new PosixParser();
+ Options options = createOptions();
+ CommandLine commandLine = parser.parse(options, args);
+ if (commandLine.hasOption(CMD_HELP)) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("NutchServer", options, true);
+ return;
+ }
+
+ if (commandLine.hasOption(CMD_PORT)) {
+ port = Integer.parseInt(commandLine.getOptionValue(CMD_PORT));
+ }
if (commandLine.hasOption(CMD_HOST)) {
host = commandLine.getOptionValue(CMD_HOST);
}
- startServer();
- }
+ startServer();
+ }
- private static Options createOptions() {
- Options options = new Options();
+ private static Options createOptions() {
+ Options options = new Options();
- OptionBuilder.withDescription("Show this help");
- options.addOption(OptionBuilder.create(CMD_HELP));
+ OptionBuilder.withDescription("Show this help");
+ options.addOption(OptionBuilder.create(CMD_HELP));
- OptionBuilder.withArgName("port");
- OptionBuilder.hasOptionalArg();
- OptionBuilder.withDescription("The port to run the Nutch Server. Default port 8081");
- options.addOption(OptionBuilder.create(CMD_PORT));
+ OptionBuilder.withArgName("port");
+ OptionBuilder.hasOptionalArg();
+ OptionBuilder.withDescription("The port to run the Nutch Server. Default port 8081");
+ options.addOption(OptionBuilder.create(CMD_PORT));
OptionBuilder.withArgName("host");
OptionBuilder.hasOptionalArg();
OptionBuilder.withDescription("The host to bind the Nutch Server to. Default is localhost.");
options.addOption(OptionBuilder.create(CMD_HOST));
- return options;
- }
+ return options;
+ }
}
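
The constructor above now wires job administration into the server: a bounded
queue of at most JOB_CAPACITY runnables feeds a NutchServerPoolExecutor, and
both are handed to JobManagerImpl together with the ConfManager. As a minimal
sketch of what that wiring amounts to, assuming NutchServerPoolExecutor is a
ThreadPoolExecutor subclass (its body is not part of this diff), the
equivalent with plain java.util.concurrent types is:

    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.ThreadPoolExecutor;
    import java.util.concurrent.TimeUnit;

    public class JobExecutorWiringSketch {
      private static final int JOB_CAPACITY = 100;

      static ThreadPoolExecutor newJobExecutor() {
        // Bounded queue: at most JOB_CAPACITY jobs wait for a free worker.
        BlockingQueue<Runnable> runnables =
            new ArrayBlockingQueue<Runnable>(JOB_CAPACITY);
        // 10 core threads, up to JOB_CAPACITY threads; idle extras are
        // reclaimed after one hour.
        return new ThreadPoolExecutor(10, JOB_CAPACITY, 1, TimeUnit.HOURS, runnables);
      }

      public static void main(String[] args) {
        ThreadPoolExecutor executor = newJobExecutor();
        executor.execute(new Runnable() {
          public void run() { System.out.println("job running"); }
        });
        executor.shutdown();
      }
    }

With these parameters a ThreadPoolExecutor only grows past its 10 core
threads once the queue is full, so the practical effect is 10 concurrent
jobs with up to 100 more queued.
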
Modified: nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java Wed Apr 22 01:46:28 2015
@@ -34,99 +34,99 @@ import org.apache.nutch.util.NutchConfig
import com.google.common.collect.Maps;
public class ConfManagerImpl implements ConfManager {
-
- private Map<String, Configuration> configurations = Maps.newConcurrentMap();
- private AtomicInteger newConfigId = new AtomicInteger();
+ private Map<String, Configuration> configurations = Maps.newConcurrentMap();
- public ConfManagerImpl() {
- configurations.put(ConfigResource.DEFAULT, NutchConfiguration.create());
- }
-
- /**
- * Returns the configuration associatedConfManagerImpl with the given confId
- */
- public Configuration get(String confId) {
- if (confId == null) {
- return configurations.get(ConfigResource.DEFAULT);
- }
- return configurations.get(confId);
- }
-
- public Map<String, String> getAsMap(String confId) {
- Configuration configuration = configurations.get(confId);
- if (configuration == null) {
- return Collections.emptyMap();
- }
-
- Iterator<Entry<String, String>> iterator = configuration.iterator();
- Map<String, String> configMap = Maps.newTreeMap();
- while (iterator.hasNext()) {
- Entry<String, String> entry = iterator.next();
- configMap.put(entry.getKey(), entry.getValue());
- }
- return configMap;
- }
-
- /**
- * Sets the given property in the configuration associated with the confId
- */
- public void setProperty(String confId, String propName, String propValue) {
- if (!configurations.containsKey(confId)) {
- throw new IllegalArgumentException("Unknown configId '" + confId + "'");
- }
- Configuration conf = configurations.get(confId);
- conf.set(propName, propValue);
- }
-
- public Set<String> list() {
- return configurations.keySet();
- }
-
- /**
- * Created a new configuration based on the values provided.
- * @param NutchConfig
- * @return String - confId
- */
- public String create(NutchConfig nutchConfig) {
- if (StringUtils.isBlank(nutchConfig.getConfigId())) {
- nutchConfig.setConfigId(String.valueOf(newConfigId.incrementAndGet()));
- }
-
- if (!canCreate(nutchConfig)) {
- throw new IllegalArgumentException("Config already exists.");
- }
-
- createHadoopConfig(nutchConfig);
- return nutchConfig.getConfigId();
- }
-
-
- public void delete(String confId) {
- configurations.remove(confId);
- }
-
- private boolean canCreate(NutchConfig nutchConfig) {
- if (nutchConfig.isForce()) {
- return true;
- }
- if (!configurations.containsKey(nutchConfig.getConfigId())) {
- return true;
- }
- return false;
- }
-
- private void createHadoopConfig(NutchConfig nutchConfig) {
- Configuration conf = NutchConfiguration.create();
- configurations.put(nutchConfig.getConfigId(), conf);
-
- if (MapUtils.isEmpty(nutchConfig.getParams())) {
- return;
- }
- for (Entry<String, String> e : nutchConfig.getParams().entrySet()) {
- conf.set(e.getKey(), e.getValue());
- }
- }
+ private AtomicInteger newConfigId = new AtomicInteger();
+
+ public ConfManagerImpl() {
+ configurations.put(ConfigResource.DEFAULT, NutchConfiguration.create());
+ }
+
+ /**
+ * Returns the configuration associated with the given confId
+ */
+ public Configuration get(String confId) {
+ if (confId == null) {
+ return configurations.get(ConfigResource.DEFAULT);
+ }
+ return configurations.get(confId);
+ }
+
+ public Map<String, String> getAsMap(String confId) {
+ Configuration configuration = configurations.get(confId);
+ if (configuration == null) {
+ return Collections.emptyMap();
+ }
+
+ Iterator<Entry<String, String>> iterator = configuration.iterator();
+ Map<String, String> configMap = Maps.newTreeMap();
+ while (iterator.hasNext()) {
+ Entry<String, String> entry = iterator.next();
+ configMap.put(entry.getKey(), entry.getValue());
+ }
+ return configMap;
+ }
+
+ /**
+ * Sets the given property in the configuration associated with the confId
+ */
+ public void setProperty(String confId, String propName, String propValue) {
+ if (!configurations.containsKey(confId)) {
+ throw new IllegalArgumentException("Unknown configId '" + confId + "'");
+ }
+ Configuration conf = configurations.get(confId);
+ conf.set(propName, propValue);
+ }
+
+ public Set<String> list() {
+ return configurations.keySet();
+ }
+
+ /**
+ * Creates a new configuration based on the values provided.
+ * @param nutchConfig the requested config id, force flag and params
+ * @return String - confId
+ */
+ public String create(NutchConfig nutchConfig) {
+ if (StringUtils.isBlank(nutchConfig.getConfigId())) {
+ nutchConfig.setConfigId(String.valueOf(newConfigId.incrementAndGet()));
+ }
+
+ if (!canCreate(nutchConfig)) {
+ throw new IllegalArgumentException("Config already exists.");
+ }
+
+ createHadoopConfig(nutchConfig);
+ return nutchConfig.getConfigId();
+ }
+
+
+ public void delete(String confId) {
+ configurations.remove(confId);
+ }
+
+ private boolean canCreate(NutchConfig nutchConfig) {
+ if (nutchConfig.isForce()) {
+ return true;
+ }
+ if (!configurations.containsKey(nutchConfig.getConfigId())) {
+ return true;
+ }
+ return false;
+ }
+
+ private void createHadoopConfig(NutchConfig nutchConfig) {
+ Configuration conf = NutchConfiguration.create();
+ configurations.put(nutchConfig.getConfigId(), conf);
+
+ if (MapUtils.isEmpty(nutchConfig.getParams())) {
+ return;
+ }
+ for (Entry<String, String> e : nutchConfig.getParams().entrySet()) {
+ conf.set(e.getKey(), e.getValue());
+ }
+ }
}
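
ConfManagerImpl keeps each Configuration in a concurrent map keyed by confId,
seeds the map with the "default" entry, and draws numeric ids from an
AtomicInteger when a request arrives without one. A minimal usage sketch (not
part of this commit; NutchConfig's setters are assumed as counterparts of the
getters called above, and its package is assumed from the request-model import
seen later in this commit):

    import java.util.Collections;

    import org.apache.nutch.service.ConfManager;
    import org.apache.nutch.service.impl.ConfManagerImpl;
    import org.apache.nutch.service.model.request.NutchConfig;

    public class ConfManagerSketch {
      public static void main(String[] args) {
        ConfManager confs = new ConfManagerImpl();

        NutchConfig request = new NutchConfig();
        request.setConfigId("crawl-A");   // left blank, an id would be generated
        request.setParams(Collections.singletonMap("http.agent.name", "my-crawler"));

        String confId = confs.create(request);
        confs.setProperty(confId, "fetcher.threads.fetch", "20");
        System.out.println(confs.getAsMap(confId).get("http.agent.name"));
      }
    }
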
Modified: nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/model/response/JobInfo.java Wed Apr 22 01:46:28 2015
@@ -19,6 +19,7 @@ package org.apache.nutch.service.model.r
import java.util.Map;
import org.apache.nutch.service.JobManager.JobType;
+import org.apache.nutch.service.model.request.JobConfig;
/**
* This is the response object containing Job information
@@ -27,65 +28,75 @@ import org.apache.nutch.service.JobManag
*/
public class JobInfo {
- public static enum State {
- IDLE, RUNNING, FINISHED, FAILED, KILLED, STOPPING, KILLING, ANY
- };
-
- private String id;
- private JobType type;
- private String confId;
- private Map<String, Object> args;
- private Map<String, Object> result;
- private State state;
- private String msg;
- private String crawlId;
-
- public String getId() {
- return id;
- }
- public void setId(String id) {
- this.id = id;
- }
- public JobType getType() {
- return type;
- }
- public void setType(JobType type) {
- this.type = type;
- }
- public String getConfId() {
- return confId;
- }
- public void setConfId(String confId) {
- this.confId = confId;
- }
- public Map<String, Object> getArgs() {
- return args;
- }
- public void setArgs(Map<String, Object> args) {
- this.args = args;
- }
- public Map<String, Object> getResult() {
- return result;
- }
- public void setResult(Map<String, Object> result) {
- this.result = result;
- }
- public State getState() {
- return state;
- }
- public void setState(State state) {
- this.state = state;
- }
- public String getMsg() {
- return msg;
- }
- public void setMsg(String msg) {
- this.msg = msg;
- }
- public String getCrawlId() {
- return crawlId;
- }
- public void setCrawlId(String crawlId) {
- this.crawlId = crawlId;
- }
+ public static enum State {
+ IDLE, RUNNING, FINISHED, FAILED, KILLED, STOPPING, KILLING, ANY
+ };
+
+ private String id;
+ private JobType type;
+ private String confId;
+ private Map<String, String> args;
+ private Map<String, Object> result;
+ private State state;
+ private String msg;
+ private String crawlId;
+
+ public JobInfo(String generateId, JobConfig jobConfig, State state,
+ String msg) {
+ this.id = generateId;
+ this.type = jobConfig.getType();
+ this.confId = jobConfig.getConfId();
+ this.crawlId = jobConfig.getCrawlId();
+ this.args = jobConfig.getArgs();
+ this.msg = msg;
+ this.state = state;
+ }
+ public String getId() {
+ return id;
+ }
+ public void setId(String id) {
+ this.id = id;
+ }
+ public JobType getType() {
+ return type;
+ }
+ public void setType(JobType type) {
+ this.type = type;
+ }
+ public String getConfId() {
+ return confId;
+ }
+ public void setConfId(String confId) {
+ this.confId = confId;
+ }
+ public Map<String, String> getArgs() {
+ return args;
+ }
+ public void setArgs(Map<String, String> args) {
+ this.args = args;
+ }
+ public Map<String, Object> getResult() {
+ return result;
+ }
+ public void setResult(Map<String, Object> result) {
+ this.result = result;
+ }
+ public State getState() {
+ return state;
+ }
+ public void setState(State state) {
+ this.state = state;
+ }
+ public String getMsg() {
+ return msg;
+ }
+ public void setMsg(String msg) {
+ this.msg = msg;
+ }
+ public String getCrawlId() {
+ return crawlId;
+ }
+ public void setCrawlId(String crawlId) {
+ this.crawlId = crawlId;
+ }
}
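
JobInfo now carries a constructor that copies the identifying fields straight
off the request-side JobConfig, and args narrows from Map<String, Object> to
Map<String, String>. A construction sketch (not part of this commit; the
JobConfig setters and the INJECT constant are assumed as counterparts of the
getters and the JobType enum used above):

    import org.apache.nutch.service.JobManager.JobType;
    import org.apache.nutch.service.model.request.JobConfig;
    import org.apache.nutch.service.model.response.JobInfo;
    import org.apache.nutch.service.model.response.JobInfo.State;

    public class JobInfoSketch {
      public static void main(String[] args) {
        JobConfig config = new JobConfig();
        config.setType(JobType.INJECT);
        config.setConfId("default");
        config.setCrawlId("crawl01");

        // id, request config, initial state, status message
        JobInfo info = new JobInfo("INJECT-default-1234", config, State.IDLE, "idle");
        System.out.println(info.getId() + " " + info.getState());
      }
    }
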
Modified: nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/resources/AbstractResource.java Wed Apr 22 01:46:28 2015
@@ -28,15 +28,16 @@ import org.apache.nutch.service.NutchSer
@Produces(MediaType.APPLICATION_JSON)
public abstract class AbstractResource {
-
- protected JobManager jobManager;
- protected ConfManager configManager;
-
- public AbstractResource() {
- configManager = NutchServer.getInstance().getConfManager();
- }
-
- protected void throwBadRequestException(String message) {
- throw new WebApplicationException(Response.status(Status.BAD_REQUEST).entity(message).build());
- }
+
+ protected JobManager jobManager;
+ protected ConfManager configManager;
+
+ public AbstractResource() {
+ configManager = NutchServer.getInstance().getConfManager();
+ jobManager = NutchServer.getInstance().getJobManager();
+ }
+
+ protected void throwBadRequestException(String message) {
+ throw new WebApplicationException(Response.status(Status.BAD_REQUEST).entity(message).build());
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/resources/ConfigResource.java Wed Apr 22 01:46:28 2015
@@ -38,45 +38,45 @@ import com.fasterxml.jackson.databind.Se
@Path("/config")
public class ConfigResource extends AbstractResource{
-
- public static final String DEFAULT = "default";
- @GET
- @Path("/")
+ public static final String DEFAULT = "default";
+
+ @GET
+ @Path("/")
@JacksonFeatures(serializationEnable = { SerializationFeature.INDENT_OUTPUT })
- public Set<String> getConfigs() {
- return configManager.list();
- }
-
- @GET
- @Path("/{configId}")
+ public Set<String> getConfigs() {
+ return configManager.list();
+ }
+
+ @GET
+ @Path("/{configId}")
@JacksonFeatures(serializationEnable = { SerializationFeature.INDENT_OUTPUT })
- public Map<String, String> getConfig(@PathParam("configId") String configId) {
- return configManager.getAsMap(configId);
- }
-
- @GET
- @Path("/{configId}/{propertyId}")
+ public Map<String, String> getConfig(@PathParam("configId") String configId) {
+ return configManager.getAsMap(configId);
+ }
+
+ @GET
+ @Path("/{configId}/{propertyId}")
@JacksonFeatures(serializationEnable = { SerializationFeature.INDENT_OUTPUT })
- public String getProperty(@PathParam("configId") String configId,
- @PathParam("propertyId") String propertyId) {
- return configManager.getAsMap(configId).get(propertyId);
- }
-
- @DELETE
- @Path("/{configId}")
- public void deleteConfig(@PathParam("configId") String configId) {
- configManager.delete(configId);
- }
-
- @POST
- @Path("/{configId}")
- @Consumes(MediaType.APPLICATION_JSON)
- public String createConfig(NutchConfig newConfig) {
- if (newConfig == null) {
- throw new WebApplicationException(Response.status(Status.BAD_REQUEST)
- .entity("Nutch configuration cannot be empty!").build());
- }
- return configManager.create(newConfig);
- }
+ public String getProperty(@PathParam("configId") String configId,
+ @PathParam("propertyId") String propertyId) {
+ return configManager.getAsMap(configId).get(propertyId);
+ }
+
+ @DELETE
+ @Path("/{configId}")
+ public void deleteConfig(@PathParam("configId") String configId) {
+ configManager.delete(configId);
+ }
+
+ @POST
+ @Path("/{configId}")
+ @Consumes(MediaType.APPLICATION_JSON)
+ public String createConfig(NutchConfig newConfig) {
+ if (newConfig == null) {
+ throw new WebApplicationException(Response.status(Status.BAD_REQUEST)
+ .entity("Nutch configuration cannot be empty!").build());
+ }
+ return configManager.create(newConfig);
+ }
}
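
The reformatted resource still maps the same handful of operations: list
config ids, fetch one config as a map, fetch a single property, delete a
config, and create one from JSON. Since NutchServer publishes the endpoints
at the server root, a listing call needs nothing but the JDK; a client sketch
(not part of this commit), assuming the default localhost:8081 address from
NutchServer:

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;

    public class ConfigClientSketch {
      public static void main(String[] args) throws Exception {
        URL url = new URL("http://localhost:8081/config");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestProperty("Accept", "application/json");
        BufferedReader in = new BufferedReader(
            new InputStreamReader(conn.getInputStream(), "UTF-8"));
        String line;
        while ((line = in.readLine()) != null) {
          System.out.println(line);   // e.g. [ "default" ]
        }
        in.close();
      }
    }
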
Modified: nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/resources/JobResource.java Wed Apr 22 01:46:28 2015
@@ -28,7 +28,7 @@ import javax.ws.rs.core.MediaType;
import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.jaxrs.annotation.JacksonFeatures;
-import org.apache.nutch.service.model.response.JobConfig;
+import org.apache.nutch.service.model.request.JobConfig;
import org.apache.nutch.service.model.response.JobInfo;
import org.apache.nutch.service.model.response.JobInfo.State;
@@ -67,10 +67,10 @@ public class JobResource extends Abstrac
@POST
@Path(value = "/create")
@Consumes(MediaType.APPLICATION_JSON)
- public String create(JobConfig config) {
+ public JobInfo create(JobConfig config) {
if (config == null) {
throwBadRequestException("Job configuration is required!");
- }
- return jobManager.create(config);
+ }
+ return jobManager.create(config);
}
}
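
The functional change here is the import swap (JobConfig is a request model,
not a response model) and the richer return type: create now hands back the
whole JobInfo rather than a bare job id. A POST sketch (not part of this
commit), assuming the class-level @Path of JobResource is /job (it sits
outside this hunk) and that the JSON field names follow the bean properties
Jackson sees on JobConfig:

    import java.io.OutputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;
    import java.util.Scanner;

    public class JobClientSketch {
      public static void main(String[] args) throws Exception {
        String body = "{\"type\":\"INJECT\",\"confId\":\"default\","
            + "\"crawlId\":\"crawl01\",\"args\":{\"url_dir\":\"/tmp/seeds\"}}";

        HttpURLConnection conn = (HttpURLConnection)
            new URL("http://localhost:8081/job/create").openConnection();
        conn.setRequestMethod("POST");
        conn.setRequestProperty("Content-Type", "application/json");
        conn.setDoOutput(true);
        OutputStream out = conn.getOutputStream();
        out.write(body.getBytes("UTF-8"));
        out.close();

        Scanner in = new Scanner(conn.getInputStream(), "UTF-8");
        while (in.hasNextLine()) {
          System.out.println(in.nextLine());  // the created JobInfo as JSON
        }
        in.close();
      }
    }

The url_dir argument is illustrative only; which keys are accepted depends on
the job type.
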
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Wed Apr 22 01:46:28 2015
@@ -49,7 +49,6 @@ import org.apache.commons.compress.compr
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.FilenameUtils;
-import org.apache.commons.validator.routines.UrlValidator;
//Hadoop
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -385,12 +384,6 @@ public class CommonCrawlDataDumper {
reader.getCurrentValue(content);
Metadata metadata = content.getMetadata();
String url = key.toString();
-
- UrlValidator urlValidator = new UrlValidator();
- if (!urlValidator.isValid(url)) {
- LOG.warn("Not valid URL detected: " + url);
- }
-
String baseName = FilenameUtils.getBaseName(url);
String extension = FilenameUtils.getExtension(url);
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Apr 22 01:46:28 2015
@@ -128,13 +128,9 @@ public class FileDumper {
* @param mimeTypes
* an array of mime types we have to dump, all others will be
* filtered out.
- * @param flatDir
- * a boolean flag specifying whether the output directory should contain
- * only files instead of using nested directories to prevent naming
- * conflicts.
* @throws Exception
*/
- public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir)
+ public void dump(File outputDir, File segmentRootDir, String[] mimeTypes)
throws Exception {
if (mimeTypes == null)
LOG.info("Accepting all mimetypes.");
@@ -213,11 +209,7 @@ public class FileDumper {
if (filter) {
String md5Ofurl = DumpFileUtil.getUrlMD5(url);
-
- String fullDir = outputDir.getAbsolutePath();
- if (!flatDir) {
- fullDir = DumpFileUtil.createTwoLevelsDirectory(fullDir, md5Ofurl);
- }
+ String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
if (!Strings.isNullOrEmpty(fullDir)) {
String outputFullPath = String.format("%s/%s", fullDir, DumpFileUtil.createFileName(md5Ofurl, baseName, extension));
@@ -281,12 +273,6 @@ public class FileDumper {
.withDescription(
"an optional list of mimetypes to dump, excluding all others. Defaults to all.")
.create("mimetype");
- @SuppressWarnings("static-access")
- Option dirStructureOpt = OptionBuilder
- .withArgName("flatdir")
- .withDescription(
- "optionally specify that the output directory should only contain files.")
- .create("flatdir");
// create the options
Options options = new Options();
@@ -294,7 +280,6 @@ public class FileDumper {
options.addOption(outputOpt);
options.addOption(segOpt);
options.addOption(mimeOpt);
- options.addOption(dirStructureOpt);
CommandLineParser parser = new GnuParser();
try {
@@ -309,7 +294,6 @@ public class FileDumper {
File outputDir = new File(line.getOptionValue("outputDir"));
File segmentRootDir = new File(line.getOptionValue("segment"));
String[] mimeTypes = line.getOptionValues("mimetype");
- boolean flatDir = line.hasOption("flatdir");
if (!outputDir.exists()) {
LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
@@ -320,7 +304,7 @@ public class FileDumper {
}
FileDumper dumper = new FileDumper();
- dumper.dump(outputDir, segmentRootDir, mimeTypes, flatDir);
+ dumper.dump(outputDir, segmentRootDir, mimeTypes);
} catch (Exception e) {
LOG.error("FileDumper: " + StringUtils.stringifyException(e));
e.printStackTrace();
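
With the flatdir flag gone, every dumped file is again placed in a two-level
directory derived from the MD5 of its URL, which is what prevents name
collisions between pages that share a base name. A path-construction sketch
(not part of this commit), using the DumpFileUtil helpers referenced above
with semantics inferred from their names and call sites; the package is
assumed to be org.apache.nutch.util:

    import org.apache.nutch.util.DumpFileUtil;

    public class DumpPathSketch {
      public static void main(String[] args) {
        String url = "http://example.com/docs/page.html";
        String md5OfUrl = DumpFileUtil.getUrlMD5(url);
        // Creates and returns a two-level directory under the output root,
        // keyed off the MD5; may return null/empty on failure, which the
        // caller above guards with Strings.isNullOrEmpty.
        String fullDir = DumpFileUtil.createTwoLevelsDirectory("/tmp/dump", md5OfUrl);
        String outputFullPath = String.format("%s/%s", fullDir,
            DumpFileUtil.createFileName(md5OfUrl, "page", "html"));
        System.out.println(outputFullPath);
      }
    }
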
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java Wed Apr 22 01:46:28 2015
@@ -67,20 +67,8 @@ public class DomainStatistics extends Co
public int run(String[] args) throws Exception {
if (args.length < 3) {
- System.err.println("Usage: DomainStatistics inputDirs outDir mode [numOfReducer]");
-
- System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
- System.err.println("\t\t\tE.g.: crawl/crawldb/current/");
-
- System.err.println("\toutDir\t\tOutput directory where results should be dumped");
-
- System.err.println("\tmode\t\tSet statistics gathering mode");
- System.err.println("\t\t\t\thost\tGather statistics by host");
- System.err.println("\t\t\t\tdomain\tGather statistics by domain");
- System.err.println("\t\t\t\tsuffix\tGather statistics by suffix");
- System.err.println("\t\t\t\ttld\tGather statistics by top level directory");
-
- System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+ System.out
+ .println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
return 1;
}
String inputDir = args[0];
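
The multi-line usage banner collapses to a single line; the arguments
themselves are unchanged: comma-separated crawldb input dirs, an output dir,
one of host|domain|suffix|tld, and an optional reducer count. A launch sketch
(not part of this commit), assuming DomainStatistics follows the usual Hadoop
Tool pattern implied by the run(String[]) signature above:

    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.util.NutchConfiguration;
    import org.apache.nutch.util.domain.DomainStatistics;

    public class DomainStatsSketch {
      public static void main(String[] args) throws Exception {
        // inputDirs outDir mode [numOfReducer]
        int rc = ToolRunner.run(NutchConfiguration.create(), new DomainStatistics(),
            new String[] { "crawl/crawldb/current", "stats-out", "domain" });
        System.exit(rc);
      }
    }
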