Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [3/26] - in /nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/pr...
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Jan 29 05:38:59 2015
@@ -51,9 +51,9 @@ import org.apache.nutch.util.URLUtil;
* Generates a subset of a crawl db to fetch. This version allows generating
* fetchlists for several segments in one go. Unlike in the initial version
* (OldGenerator), the IP resolution is done ONLY on the entries which have been
- * selected for fetching. The URLs are partitioned by IP, domain or host within a
- * segment. We can chose separately how to count the URLS i.e. by domain or host
- * to limit the entries.
+ * selected for fetching. The URLs are partitioned by IP, domain or host within
+ * a segment. We can choose separately how to count the URLs, i.e. by domain
+ * or host, to limit the entries.
**/
public class Generator extends Configured implements Tool {
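
(For illustration, a minimal driver sketch, not part of this commit, showing how the
generate() entry point reformatted later in this diff is typically called. The crawl
paths and the topN/numLists values are hypothetical.)

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.nutch.crawl.Generator;
  import org.apache.nutch.util.NutchConfiguration;

  public class GenerateDemo {
    public static void main(String[] args) throws Exception {
      Configuration conf = NutchConfiguration.create();
      Generator generator = new Generator(conf);
      Path crawlDb = new Path("crawl/crawldb");      // hypothetical CrawlDb location
      Path segmentsDir = new Path("crawl/segments"); // hypothetical segments parent
      // 10 fetch lists per segment, at most 50000 top-scoring URLs,
      // current time as the generation cut-off.
      Path[] segments = generator.generate(crawlDb, segmentsDir, 10, 50000L,
          System.currentTimeMillis());
      if (segments == null) {
        return; // generation failed or produced nothing
      }
      for (Path segment : segments) {
        System.out.println("generated segment: " + segment);
      }
    }
  }
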
@@ -73,7 +73,7 @@ public class Generator extends Configure
public static final String GENERATOR_CUR_TIME = "generate.curTime";
public static final String GENERATOR_DELAY = "crawl.gen.delay";
public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments";
-
+
public static class SelectorEntry implements Writable {
public Text url;
public CrawlDatum datum;
@@ -98,25 +98,25 @@ public class Generator extends Configure
}
public String toString() {
- return "url=" + url.toString() + ", datum=" + datum.toString() + ", segnum="
- + segnum.toString();
+ return "url=" + url.toString() + ", datum=" + datum.toString()
+ + ", segnum=" + segnum.toString();
}
}
/** Selects entries due for fetch. */
public static class Selector implements
- Mapper<Text,CrawlDatum,FloatWritable,SelectorEntry>,
- Partitioner<FloatWritable,Writable>,
- Reducer<FloatWritable,SelectorEntry,FloatWritable,SelectorEntry> {
+ Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>,
+ Partitioner<FloatWritable, Writable>,
+ Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> {
private LongWritable genTime = new LongWritable(System.currentTimeMillis());
private long curTime;
private long limit;
private long count;
- private HashMap<String,int[]> hostCounts = new HashMap<String,int[]>();
+ private HashMap<String, int[]> hostCounts = new HashMap<String, int[]>();
private int segCounts[];
private int maxCount;
private boolean byDomain = false;
- private Partitioner<Text,Writable> partitioner = new URLPartitioner();
+ private Partitioner<Text, Writable> partitioner = new URLPartitioner();
private URLFilters filters;
private URLNormalizers normalizers;
private ScoringFilters scfilters;
@@ -134,22 +134,26 @@ public class Generator extends Configure
public void configure(JobConf job) {
curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());
- limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE) / job.getNumReduceTasks();
+ limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE)
+ / job.getNumReduceTasks();
maxCount = job.getInt(GENERATOR_MAX_COUNT, -1);
- if (maxCount==-1){
+ if (maxCount == -1) {
byDomain = false;
}
- if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true;
+ if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE)))
+ byDomain = true;
filters = new URLFilters(job);
normalise = job.getBoolean(GENERATOR_NORMALISE, true);
- if (normalise) normalizers = new URLNormalizers(job,
- URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
+ if (normalise)
+ normalizers = new URLNormalizers(job,
+ URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
scfilters = new ScoringFilters(job);
partitioner.configure(job);
filter = job.getBoolean(GENERATOR_FILTER, true);
genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
- if (time > 0) genTime.set(time);
+ if (time > 0)
+ genTime.set(time);
schedule = FetchScheduleFactory.getFetchSchedule(job);
scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
@@ -158,21 +162,24 @@ public class Generator extends Configure
segCounts = new int[maxNumSegments];
}
- public void close() {}
+ public void close() {
+ }
/** Select & invert subset due for fetch. */
public void map(Text key, CrawlDatum value,
- OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
+ OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
throws IOException {
Text url = key;
if (filter) {
// If filtering is on don't generate URLs that don't pass
// URLFilters
try {
- if (filters.filter(url.toString()) == null) return;
+ if (filters.filter(url.toString()) == null)
+ return;
} catch (URLFilterException e) {
if (LOG.isWarnEnabled()) {
- LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+ LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage()
+ + ")");
}
}
}
@@ -189,8 +196,8 @@ public class Generator extends Configure
Nutch.WRITABLE_GENERATE_TIME_KEY);
if (oldGenTime != null) { // awaiting fetch & update
if (oldGenTime.get() + genDelay > curTime) // still wait for
- // update
- return;
+ // update
+ return;
}
float sort = 1.0f;
try {
@@ -202,13 +209,19 @@ public class Generator extends Configure
}
if (restrictStatus != null
- && !restrictStatus.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) return;
+ && !restrictStatus.equalsIgnoreCase(CrawlDatum
+ .getStatusName(crawlDatum.getStatus())))
+ return;
// consider only entries with a score superior to the threshold
- if (scoreThreshold != Float.NaN && sort < scoreThreshold) return;
+ if (scoreThreshold != Float.NaN && sort < scoreThreshold)
+ return;
- // consider only entries with a retry (or fetch) interval lower than threshold
- if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) return;
+ // consider only entries with a retry (or fetch) interval lower than
+ // threshold
+ if (intervalThreshold != -1
+ && crawlDatum.getFetchInterval() > intervalThreshold)
+ return;
// sort by decreasing score, using DecreasingFloatComparator
sortValue.set(sort);
@@ -220,13 +233,15 @@ public class Generator extends Configure
}
/** Partition by host / domain or IP. */
- public int getPartition(FloatWritable key, Writable value, int numReduceTasks) {
- return partitioner.getPartition(((SelectorEntry) value).url, key, numReduceTasks);
+ public int getPartition(FloatWritable key, Writable value,
+ int numReduceTasks) {
+ return partitioner.getPartition(((SelectorEntry) value).url, key,
+ numReduceTasks);
}
/** Collect until limit is reached. */
public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
- OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
+ OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
throws IOException {
while (values.hasNext()) {
@@ -236,7 +251,8 @@ public class Generator extends Configure
if (currentsegmentnum < maxNumSegments) {
count = 0;
currentsegmentnum++;
- } else break;
+ } else
+ break;
}
SelectorEntry entry = values.next();
@@ -270,7 +286,7 @@ public class Generator extends Configure
if (maxCount > 0) {
int[] hostCount = hostCounts.get(hostordomain);
if (hostCount == null) {
- hostCount = new int[] {1, 0};
+ hostCount = new int[] { 1, 0 };
hostCounts.put(hostordomain, hostCount);
}
@@ -278,7 +294,8 @@ public class Generator extends Configure
hostCount[1]++;
// check if topN reached, select next segment if it is
- while (segCounts[hostCount[0]-1] >= limit && hostCount[0] < maxNumSegments) {
+ while (segCounts[hostCount[0] - 1] >= limit
+ && hostCount[0] < maxNumSegments) {
hostCount[0]++;
hostCount[1] = 0;
}
@@ -291,18 +308,23 @@ public class Generator extends Configure
hostCount[1] = 0;
} else {
if (hostCount[1] == maxCount + 1 && LOG.isInfoEnabled()) {
- LOG.info("Host or domain " + hostordomain + " has more than " + maxCount
- + " URLs for all " + maxNumSegments + " segments. Additional URLs won't be included in the fetchlist.");
+ LOG.info("Host or domain "
+ + hostordomain
+ + " has more than "
+ + maxCount
+ + " URLs for all "
+ + maxNumSegments
+ + " segments. Additional URLs won't be included in the fetchlist.");
}
// skip this entry
continue;
}
}
entry.segnum = new IntWritable(hostCount[0]);
- segCounts[hostCount[0]-1]++;
+ segCounts[hostCount[0] - 1]++;
} else {
entry.segnum = new IntWritable(currentsegmentnum);
- segCounts[currentsegmentnum-1]++;
+ segCounts[currentsegmentnum - 1]++;
}
output.collect(key, entry);
@@ -316,16 +338,17 @@ public class Generator extends Configure
// Allows the reducers to generate one subfile per
public static class GeneratorOutputFormat extends
- MultipleSequenceFileOutputFormat<FloatWritable,SelectorEntry> {
+ MultipleSequenceFileOutputFormat<FloatWritable, SelectorEntry> {
// generate a filename based on the segnum stored for this entry
- protected String generateFileNameForKeyValue(FloatWritable key, SelectorEntry value,
- String name) {
+ protected String generateFileNameForKeyValue(FloatWritable key,
+ SelectorEntry value, String name) {
return "fetchlist-" + value.segnum.toString() + "/" + name;
}
}
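
(Illustration only, not part of this commit: with the default Hadoop part naming and,
say, three segments requested, the select job's temporary output is laid out roughly
as below; each "fetchlist-N" subdirectory is later turned into its own segment by
partitionSegment(). The UUID and part names here are hypothetical.)

  generate-temp-<uuid>/
    fetchlist-1/part-00000
    fetchlist-2/part-00000
    fetchlist-3/part-00000
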
- public static class DecreasingFloatComparator extends FloatWritable.Comparator {
+ public static class DecreasingFloatComparator extends
+ FloatWritable.Comparator {
/** Compares two FloatWritables decreasing. */
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
@@ -334,20 +357,22 @@ public class Generator extends Configure
}
public static class SelectorInverseMapper extends MapReduceBase implements
- Mapper<FloatWritable,SelectorEntry,Text,SelectorEntry> {
+ Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> {
public void map(FloatWritable key, SelectorEntry value,
- OutputCollector<Text,SelectorEntry> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, SelectorEntry> output, Reporter reporter)
+ throws IOException {
SelectorEntry entry = value;
output.collect(entry.url, entry);
}
}
public static class PartitionReducer extends MapReduceBase implements
- Reducer<Text,SelectorEntry,Text,CrawlDatum> {
+ Reducer<Text, SelectorEntry, Text, CrawlDatum> {
public void reduce(Text key, Iterator<SelectorEntry> values,
- OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+ throws IOException {
// if using HashComparator, we get only one input key in case of
// hash collision
// so use only URLs from values
@@ -365,7 +390,7 @@ public class Generator extends Configure
super(Text.class);
}
- @SuppressWarnings("rawtypes" )
+ @SuppressWarnings("rawtypes")
public int compare(WritableComparable a, WritableComparable b) {
Text url1 = (Text) a;
Text url2 = (Text) b;
@@ -395,15 +420,17 @@ public class Generator extends Configure
* Update the CrawlDB so that the next generate won't include the same URLs.
*/
public static class CrawlDbUpdater extends MapReduceBase implements
- Mapper<Text,CrawlDatum,Text,CrawlDatum>, Reducer<Text,CrawlDatum,Text,CrawlDatum> {
+ Mapper<Text, CrawlDatum, Text, CrawlDatum>,
+ Reducer<Text, CrawlDatum, Text, CrawlDatum> {
long generateTime;
public void configure(JobConf job) {
generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
}
- public void map(Text key, CrawlDatum value, OutputCollector<Text,CrawlDatum> output,
- Reporter reporter) throws IOException {
+ public void map(Text key, CrawlDatum value,
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+ throws IOException {
output.collect(key, value);
}
@@ -411,7 +438,8 @@ public class Generator extends Configure
private LongWritable genTime = new LongWritable(0L);
public void reduce(Text key, Iterator<CrawlDatum> values,
- OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+ throws IOException {
genTime.set(0L);
while (values.hasNext()) {
CrawlDatum val = values.next();
@@ -435,19 +463,21 @@ public class Generator extends Configure
}
}
- public Generator() {}
+ public Generator() {
+ }
public Generator(Configuration conf) {
setConf(conf);
}
- public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime)
- throws IOException {
+ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
+ long curTime) throws IOException {
JobConf job = new NutchJob(getConf());
boolean filter = job.getBoolean(GENERATOR_FILTER, true);
boolean normalise = job.getBoolean(GENERATOR_NORMALISE, true);
- return generate(dbDir, segments, numLists, topN, curTime, filter, normalise, false, 1);
+ return generate(dbDir, segments, numLists, topN, curTime, filter,
+ normalise, false, 1);
}
/**
@@ -456,7 +486,8 @@ public class Generator extends Configure
**/
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime, boolean filter, boolean force) throws IOException {
- return generate(dbDir, segments, numLists, topN, curTime, filter, true, force, 1);
+ return generate(dbDir, segments, numLists, topN, curTime, filter, true,
+ force, 1);
}
/**
@@ -482,11 +513,11 @@ public class Generator extends Configure
* When an I/O error occurs
*/
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
- long curTime, boolean filter, boolean norm, boolean force, int maxNumSegments)
- throws IOException {
+ long curTime, boolean filter, boolean norm, boolean force,
+ int maxNumSegments) throws IOException {
- Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-"
- + java.util.UUID.randomUUID().toString());
+ Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
+ + "/generate-temp-" + java.util.UUID.randomUUID().toString());
Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
FileSystem fs = FileSystem.get(getConf());
@@ -501,7 +532,7 @@ public class Generator extends Configure
if (topN != Long.MAX_VALUE) {
LOG.info("Generator: topN: " + topN);
}
-
+
// map to inverted subset due for fetch, sort by score
JobConf job = new NutchJob(getConf());
job.setJobName("generate: select from " + dbDir);
@@ -553,7 +584,8 @@ public class Generator extends Configure
try {
for (FileStatus stat : status) {
Path subfetchlist = stat.getPath();
- if (!subfetchlist.getName().startsWith("fetchlist-")) continue;
+ if (!subfetchlist.getName().startsWith("fetchlist-"))
+ continue;
// start a new partition job for this segment
Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
generatedSegments.add(newSeg);
@@ -573,8 +605,8 @@ public class Generator extends Configure
if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
// update the db from tempDir
- Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-"
- + java.util.UUID.randomUUID().toString());
+ Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".")
+ + "/generate-temp-" + java.util.UUID.randomUUID().toString());
job = new NutchJob(getConf());
job.setJobName("generate: updatedb " + dbDir);
@@ -607,7 +639,8 @@ public class Generator extends Configure
fs.delete(tempDir, true);
long end = System.currentTimeMillis();
- LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
Path[] patharray = new Path[generatedSegments.size()];
return generatedSegments.toArray(patharray);
@@ -653,7 +686,8 @@ public class Generator extends Configure
public static synchronized String generateSegmentName() {
try {
Thread.sleep(1000);
- } catch (Throwable t) {}
+ } catch (Throwable t) {
+ }
;
return sdf.format(new Date(System.currentTimeMillis()));
}
@@ -662,7 +696,8 @@ public class Generator extends Configure
* Generate a fetchlist from the crawldb.
*/
public static void main(String args[]) throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new Generator(), args);
+ int res = ToolRunner
+ .run(NutchConfiguration.create(), new Generator(), args);
System.exit(res);
}
@@ -706,9 +741,10 @@ public class Generator extends Configure
}
try {
- Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter,
- norm, force, maxNumSegments);
- if (segs == null) return 1;
+ Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime,
+ filter, norm, force, maxNumSegments);
+ if (segs == null)
+ return 1;
} catch (Exception e) {
LOG.error("Generator: " + StringUtils.stringifyException(e));
return -1;
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java Thu Jan 29 05:38:59 2015
@@ -26,7 +26,8 @@ public class Inlink implements Writable
private String fromUrl;
private String anchor;
- public Inlink() {}
+ public Inlink() {
+ }
public Inlink(String fromUrl, String anchor) {
this.fromUrl = fromUrl;
@@ -40,8 +41,8 @@ public class Inlink implements Writable
/** Skips over one Inlink in the input. */
public static void skip(DataInput in) throws IOException {
- Text.skip(in); // skip fromUrl
- Text.skip(in); // skip anchor
+ Text.skip(in); // skip fromUrl
+ Text.skip(in); // skip anchor
}
public void write(DataOutput out) throws IOException {
@@ -55,16 +56,20 @@ public class Inlink implements Writable
return inlink;
}
- public String getFromUrl() { return fromUrl; }
- public String getAnchor() { return anchor; }
+ public String getFromUrl() {
+ return fromUrl;
+ }
+
+ public String getAnchor() {
+ return anchor;
+ }
public boolean equals(Object o) {
if (!(o instanceof Inlink))
return false;
- Inlink other = (Inlink)o;
- return
- this.fromUrl.equals(other.fromUrl) &&
- this.anchor.equals(other.anchor);
+ Inlink other = (Inlink) o;
+ return this.fromUrl.equals(other.fromUrl)
+ && this.anchor.equals(other.anchor);
}
public int hashCode() {
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java Thu Jan 29 05:38:59 2015
@@ -27,17 +27,25 @@ import org.apache.hadoop.io.*;
public class Inlinks implements Writable {
private HashSet<Inlink> inlinks = new HashSet<Inlink>(1);
- public void add(Inlink inlink) { inlinks.add(inlink); }
+ public void add(Inlink inlink) {
+ inlinks.add(inlink);
+ }
- public void add(Inlinks inlinks) { this.inlinks.addAll(inlinks.inlinks); }
+ public void add(Inlinks inlinks) {
+ this.inlinks.addAll(inlinks.inlinks);
+ }
public Iterator<Inlink> iterator() {
return this.inlinks.iterator();
}
-
- public int size() { return inlinks.size(); }
- public void clear() { inlinks.clear(); }
+ public int size() {
+ return inlinks.size();
+ }
+
+ public void clear() {
+ inlinks.clear();
+ }
public void readFields(DataInput in) throws IOException {
int length = in.readInt();
@@ -67,30 +75,32 @@ public class Inlinks implements Writable
return buffer.toString();
}
- /** Return the set of anchor texts. Only a single anchor with a given text
- * is permitted from a given domain. */
+ /**
+ * Return the set of anchor texts. Only a single anchor with a given text is
+ * permitted from a given domain.
+ */
public String[] getAnchors() {
- HashMap<String, Set<String>> domainToAnchors =
- new HashMap<String, Set<String>>();
+ HashMap<String, Set<String>> domainToAnchors = new HashMap<String, Set<String>>();
ArrayList<String> results = new ArrayList<String>();
Iterator<Inlink> it = inlinks.iterator();
while (it.hasNext()) {
Inlink inlink = it.next();
String anchor = inlink.getAnchor();
- if (anchor.length() == 0) // skip empty anchors
+ if (anchor.length() == 0) // skip empty anchors
continue;
- String domain = null; // extract domain name
+ String domain = null; // extract domain name
try {
domain = new URL(inlink.getFromUrl()).getHost();
- } catch (MalformedURLException e) {}
+ } catch (MalformedURLException e) {
+ }
Set<String> domainAnchors = domainToAnchors.get(domain);
if (domainAnchors == null) {
domainAnchors = new HashSet<String>();
domainToAnchors.put(domain, domainAnchors);
}
- if (domainAnchors.add(anchor)) { // new anchor from domain
- results.add(anchor); // collect it
+ if (domainAnchors.add(anchor)) { // new anchor from domain
+ results.add(anchor); // collect it
}
}
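
(A worked example, not part of this commit, with hypothetical URLs: duplicate anchor
text from the same domain is dropped, while the same text from a second domain is
kept, so getAnchors() returns two entries here.)

  Inlinks inlinks = new Inlinks();
  inlinks.add(new Inlink("http://example.com/a", "home"));
  inlinks.add(new Inlink("http://example.com/b", "home")); // same domain, same text: skipped
  inlinks.add(new Inlink("http://example.org/c", "home")); // new domain: kept
  String[] anchors = inlinks.getAnchors();                 // two "home" entries, one per domain
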
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Jan 29 05:38:59 2015
@@ -43,7 +43,8 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
/** Maintains an inverted link map, listing incoming links for each url. */
-public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData, Text, Inlinks> {
+public class LinkDb extends Configured implements Tool,
+ Mapper<Text, ParseData, Text, Inlinks> {
public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
@@ -56,13 +57,14 @@ public class LinkDb extends Configured i
private boolean ignoreInternalLinks;
private URLFilters urlFilters;
private URLNormalizers urlNormalizers;
-
- public LinkDb() {}
-
+
+ public LinkDb() {
+ }
+
public LinkDb(Configuration conf) {
setConf(conf);
}
-
+
public void configure(JobConf job) {
maxAnchorLength = job.getInt("db.max.anchor.length", 100);
ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
@@ -74,16 +76,19 @@ public class LinkDb extends Configured i
}
}
- public void close() {}
+ public void close() {
+ }
public void map(Text key, ParseData parseData,
- OutputCollector<Text, Inlinks> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, Inlinks> output, Reporter reporter)
+ throws IOException {
String fromUrl = key.toString();
String fromHost = getHost(fromUrl);
if (urlNormalizers != null) {
try {
- fromUrl = urlNormalizers.normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url
+ fromUrl = urlNormalizers
+ .normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the
+ // url
} catch (Exception e) {
LOG.warn("Skipping " + fromUrl + ":" + e);
fromUrl = null;
@@ -97,7 +102,8 @@ public class LinkDb extends Configured i
fromUrl = null;
}
}
- if (fromUrl == null) return; // discard all outlinks
+ if (fromUrl == null)
+ return; // discard all outlinks
Outlink[] outlinks = parseData.getOutlinks();
Inlinks inlinks = new Inlinks();
for (int i = 0; i < outlinks.length; i++) {
@@ -107,12 +113,14 @@ public class LinkDb extends Configured i
if (ignoreInternalLinks) {
String toHost = getHost(toUrl);
if (toHost == null || toHost.equals(fromHost)) { // internal link
- continue; // skip it
+ continue; // skip it
}
}
if (urlNormalizers != null) {
try {
- toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url
+ toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize
+ // the
+ // url
} catch (Exception e) {
LOG.warn("Skipping " + toUrl + ":" + e);
toUrl = null;
@@ -126,13 +134,14 @@ public class LinkDb extends Configured i
toUrl = null;
}
}
- if (toUrl == null) continue;
+ if (toUrl == null)
+ continue;
inlinks.clear();
- String anchor = outlink.getAnchor(); // truncate long anchors
+ String anchor = outlink.getAnchor(); // truncate long anchors
if (anchor.length() > maxAnchorLength) {
anchor = anchor.substring(0, maxAnchorLength);
}
- inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link
+ inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link
output.collect(new Text(toUrl), inlinks);
}
}
@@ -145,13 +154,16 @@ public class LinkDb extends Configured i
}
}
- public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException {
+ public void invert(Path linkDb, final Path segmentsDir, boolean normalize,
+ boolean filter, boolean force) throws IOException {
final FileSystem fs = FileSystem.get(getConf());
- FileStatus[] files = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+ FileStatus[] files = fs.listStatus(segmentsDir,
+ HadoopFSUtil.getPassDirectoriesFilter(fs));
invert(linkDb, HadoopFSUtil.getPaths(files), normalize, filter, force);
}
- public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException {
+ public void invert(Path linkDb, Path[] segments, boolean normalize,
+ boolean filter, boolean force) throws IOException {
JobConf job = LinkDb.createJob(getConf(), linkDb, normalize, filter);
Path lock = new Path(linkDb, LOCK_NAME);
FileSystem fs = FileSystem.get(getConf());
@@ -174,7 +186,8 @@ public class LinkDb extends Configured i
if (LOG.isInfoEnabled()) {
LOG.info("LinkDb: adding segment: " + segments[i]);
}
- FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME));
+ FileInputFormat.addInputPath(job, new Path(segments[i],
+ ParseData.DIR_NAME));
}
try {
JobClient.runJob(job);
@@ -203,13 +216,14 @@ public class LinkDb extends Configured i
LinkDb.install(job, linkDb);
long end = System.currentTimeMillis();
- LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
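
(A minimal usage sketch, not part of this commit; the paths are hypothetical and
mirror what the command-line usage shown below does. invert() throws IOException.)

  Configuration conf = NutchConfiguration.create();
  LinkDb linkDb = new LinkDb(conf);
  Path db = new Path("crawl/linkdb");            // hypothetical LinkDb location
  Path segmentsDir = new Path("crawl/segments"); // hypothetical segments parent
  linkDb.invert(db, segmentsDir, true, true, false); // normalize, filter, force
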
- private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
- Path newLinkDb =
- new Path("linkdb-" +
- Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ private static JobConf createJob(Configuration config, Path linkDb,
+ boolean normalize, boolean filter) {
+ Path newLinkDb = new Path("linkdb-"
+ + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf job = new NutchJob(config);
job.setJobName("linkdb " + linkDb);
@@ -247,12 +261,14 @@ public class LinkDb extends Configured i
Path old = new Path(linkDb, "old");
Path current = new Path(linkDb, CURRENT_NAME);
if (fs.exists(current)) {
- if (fs.exists(old)) fs.delete(old, true);
+ if (fs.exists(old))
+ fs.delete(old, true);
fs.rename(current, old);
}
fs.mkdirs(linkDb);
fs.rename(newLinkDb, current);
- if (fs.exists(old)) fs.delete(old, true);
+ if (fs.exists(old))
+ fs.delete(old, true);
LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
}
@@ -263,11 +279,14 @@ public class LinkDb extends Configured i
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
+ System.err
+ .println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
System.err.println("\tlinkdb\toutput LinkDb to create or update");
- System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR");
+ System.err
+ .println("\t-dir segmentsDir\tparent directory of several segments, OR");
System.err.println("\tseg1 seg2 ...\t list of segment directories");
- System.err.println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
+ System.err
+ .println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
System.err.println("\t-noNormalize\tdon't normalize link URLs");
System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
return -1;
@@ -281,7 +300,8 @@ public class LinkDb extends Configured i
boolean force = false;
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-dir")) {
- FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
+ FileStatus[] paths = fs.listStatus(new Path(args[++i]),
+ HadoopFSUtil.getPassDirectoriesFilter(fs));
segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
} else if (args[i].equalsIgnoreCase("-noNormalize")) {
normalize = false;
@@ -289,7 +309,8 @@ public class LinkDb extends Configured i
filter = false;
} else if (args[i].equalsIgnoreCase("-force")) {
force = true;
- } else segs.add(new Path(args[i]));
+ } else
+ segs.add(new Path(args[i]));
}
try {
invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force);
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java Thu Jan 29 05:38:59 2015
@@ -31,8 +31,8 @@ import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
/**
- * This class provides a way to separate the URL normalization
- * and filtering steps from the rest of LinkDb manipulation code.
+ * This class provides a way to separate the URL normalization and filtering
+ * steps from the rest of LinkDb manipulation code.
*
* @author Andrzej Bialecki
*/
@@ -50,13 +50,13 @@ public class LinkDbFilter implements Map
private URLFilters filters;
private URLNormalizers normalizers;
-
+
private String scope;
-
+
public static final Logger LOG = LoggerFactory.getLogger(LinkDbFilter.class);
private Text newKey = new Text();
-
+
public void configure(JobConf job) {
filter = job.getBoolean(URL_FILTERING, false);
normalize = job.getBoolean(URL_NORMALIZING, false);
@@ -69,10 +69,12 @@ public class LinkDbFilter implements Map
}
}
- public void close() {}
+ public void close() {
+ }
public void map(Text key, Inlinks value,
- OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
+ OutputCollector<Text, Inlinks> output, Reporter reporter)
+ throws IOException {
String url = key.toString();
Inlinks result = new Inlinks();
if (normalize) {
@@ -91,7 +93,8 @@ public class LinkDbFilter implements Map
url = null;
}
}
- if (url == null) return; // didn't pass the filters
+ if (url == null)
+ return; // didn't pass the filters
Iterator<Inlink> it = value.iterator();
String fromUrl = null;
while (it.hasNext()) {
@@ -113,7 +116,7 @@ public class LinkDbFilter implements Map
fromUrl = null;
}
}
- if (fromUrl != null) {
+ if (fromUrl != null) {
result.add(new Inlink(fromUrl, inlink.getAnchor()));
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Thu Jan 29 05:38:59 2015
@@ -46,37 +46,44 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
/**
- * This tool merges several LinkDb-s into one, optionally filtering
- * URLs through the current URLFilters, to skip prohibited URLs and
- * links.
+ * This tool merges several LinkDb-s into one, optionally filtering URLs through
+ * the current URLFilters, to skip prohibited URLs and links.
*
- * <p>It's possible to use this tool just for filtering - in that case
- * only one LinkDb should be specified in arguments.</p>
- * <p>If more than one LinkDb contains information about the same URL,
- * all inlinks are accumulated, but only at most <code>db.max.inlinks</code>
- * inlinks will ever be added.</p>
- * <p>If activated, URLFilters will be applied to both the target URLs and
- * to any incoming link URL. If a target URL is prohibited, all
- * inlinks to that target will be removed, including the target URL. If
- * some of incoming links are prohibited, only they will be removed, and they
- * won't count when checking the above-mentioned maximum limit.
+ * <p>
+ * It's possible to use this tool just for filtering - in that case only one
+ * LinkDb should be specified in arguments.
+ * </p>
+ * <p>
+ * If more than one LinkDb contains information about the same URL, all inlinks
+ * are accumulated, but only at most <code>db.max.inlinks</code> inlinks will
+ * ever be added.
+ * </p>
+ * <p>
+ * If activated, URLFilters will be applied to both the target URLs and to any
+ * incoming link URL. If a target URL is prohibited, all inlinks to that target
+ * will be removed, including the target URL. If some of the incoming links are
+ * prohibited, only they will be removed, and they won't count when checking the
+ * above-mentioned maximum limit.
*
* @author Andrzej Bialecki
*/
-public class LinkDbMerger extends Configured implements Tool, Reducer<Text, Inlinks, Text, Inlinks> {
+public class LinkDbMerger extends Configured implements Tool,
+ Reducer<Text, Inlinks, Text, Inlinks> {
private static final Logger LOG = LoggerFactory.getLogger(LinkDbMerger.class);
-
+
private int maxInlinks;
-
+
public LinkDbMerger() {
-
+
}
-
+
public LinkDbMerger(Configuration conf) {
setConf(conf);
}
- public void reduce(Text key, Iterator<Inlinks> values, OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
+ public void reduce(Text key, Iterator<Inlinks> values,
+ OutputCollector<Text, Inlinks> output, Reporter reporter)
+ throws IOException {
Inlinks result = new Inlinks();
@@ -86,43 +93,48 @@ public class LinkDbMerger extends Config
int end = Math.min(maxInlinks - result.size(), inlinks.size());
Iterator<Inlink> it = inlinks.iterator();
int i = 0;
- while(it.hasNext() && i++ < end) {
+ while (it.hasNext() && i++ < end) {
result.add(it.next());
}
}
- if (result.size() == 0) return;
+ if (result.size() == 0)
+ return;
output.collect(key, result);
-
+
}
public void configure(JobConf job) {
maxInlinks = job.getInt("db.max.inlinks", 10000);
}
- public void close() throws IOException { }
+ public void close() throws IOException {
+ }
- public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
+ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter)
+ throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("LinkDb merge: starting at " + sdf.format(start));
JobConf job = createMergeJob(getConf(), output, normalize, filter);
for (int i = 0; i < dbs.length; i++) {
- FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));
+ FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));
}
JobClient.runJob(job);
FileSystem fs = FileSystem.get(getConf());
fs.mkdirs(output);
- fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME));
+ fs.rename(FileOutputFormat.getOutputPath(job), new Path(output,
+ LinkDb.CURRENT_NAME));
long end = System.currentTimeMillis();
- LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
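
(A minimal usage sketch, not part of this commit; the paths are hypothetical.
merge() is declared to throw Exception.)

  LinkDbMerger merger = new LinkDbMerger(NutchConfiguration.create());
  Path[] inputs = { new Path("crawl/linkdb1"), new Path("crawl/linkdb2") };
  merger.merge(new Path("crawl/linkdb-merged"), inputs, true, true); // normalize, filter
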
- public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
- Path newLinkDb =
- new Path("linkdb-merge-" +
- Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ public static JobConf createMergeJob(Configuration config, Path linkDb,
+ boolean normalize, boolean filter) {
+ Path newLinkDb = new Path("linkdb-merge-"
+ + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf job = new NutchJob(config);
job.setJobName("linkdb merge " + linkDb);
@@ -145,22 +157,27 @@ public class LinkDbMerger extends Config
return job;
}
-
+
/**
* @param args
*/
public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(),
+ args);
System.exit(res);
}
-
+
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err.println("Usage: LinkDbMerger <output_linkdb> <linkdb1> [<linkdb2> <linkdb3> ...] [-normalize] [-filter]");
+ System.err
+ .println("Usage: LinkDbMerger <output_linkdb> <linkdb1> [<linkdb2> <linkdb3> ...] [-normalize] [-filter]");
System.err.println("\toutput_linkdb\toutput LinkDb");
- System.err.println("\tlinkdb1 ...\tinput LinkDb-s (single input LinkDb is ok)");
- System.err.println("\t-normalize\tuse URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)");
- System.err.println("\t-filter\tuse URLFilters on both fromUrls and toUrls in linkdb(s)");
+ System.err
+ .println("\tlinkdb1 ...\tinput LinkDb-s (single input LinkDb is ok)");
+ System.err
+ .println("\t-normalize\tuse URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)");
+ System.err
+ .println("\t-filter\tuse URLFilters on both fromUrls and toUrls in linkdb(s)");
return -1;
}
Path output = new Path(args[0]);
@@ -172,7 +189,8 @@ public class LinkDbMerger extends Config
filter = true;
} else if (args[i].equals("-normalize")) {
normalize = true;
- } else dbs.add(new Path(args[i]));
+ } else
+ dbs.add(new Path(args[i]));
}
try {
merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter);
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Thu Jan 29 05:38:59 2015
@@ -50,14 +50,14 @@ public class LinkDbReader extends Config
private MapFile.Reader[] readers;
public LinkDbReader() {
-
+
}
-
+
public LinkDbReader(Configuration conf, Path directory) throws Exception {
setConf(conf);
init(directory);
}
-
+
public void init(Path directory) throws Exception {
this.fs = FileSystem.get(getConf());
this.directory = directory;
@@ -73,16 +73,16 @@ public class LinkDbReader extends Config
public Inlinks getInlinks(Text url) throws IOException {
if (readers == null) {
- synchronized(this) {
- readers = MapFileOutputFormat.getReaders
- (fs, new Path(directory, LinkDb.CURRENT_NAME), getConf());
+ synchronized (this) {
+ readers = MapFileOutputFormat.getReaders(fs, new Path(directory,
+ LinkDb.CURRENT_NAME), getConf());
}
}
-
- return (Inlinks)MapFileOutputFormat.getEntry
- (readers, PARTITIONER, url, new Inlinks());
+
+ return (Inlinks) MapFileOutputFormat.getEntry(readers, PARTITIONER, url,
+ new Inlinks());
}
-
+
public void close() throws IOException {
if (readers != null) {
for (int i = 0; i < readers.length; i++) {
@@ -90,7 +90,7 @@ public class LinkDbReader extends Config
}
}
}
-
+
public void processDumpJob(String linkdb, String output) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -114,19 +114,24 @@ public class LinkDbReader extends Config
JobClient.runJob(job);
long end = System.currentTimeMillis();
- LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
-
+
public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(),
+ args);
System.exit(res);
}
-
+
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err.println("Usage: LinkDbReader <linkdb> (-dump <out_dir> | -url <url>)");
- System.err.println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>");
- System.err.println("\t-url <url>\tprint information about <url> to System.out");
+ System.err
+ .println("Usage: LinkDbReader <linkdb> (-dump <out_dir> | -url <url>)");
+ System.err
+ .println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>");
+ System.err
+ .println("\t-url <url>\tprint information about <url> to System.out");
return -1;
}
try {
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java Thu Jan 29 05:38:59 2015
@@ -22,9 +22,9 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.protocol.Content;
/**
- * Default implementation of a page signature. It calculates an MD5 hash
- * of the raw binary content of a page. In case there is no content, it
- * calculates a hash from the page's URL.
+ * Default implementation of a page signature. It calculates an MD5 hash of the
+ * raw binary content of a page. In case there is no content, it calculates a
+ * hash from the page's URL.
*
* @author Andrzej Bialecki <ab@getopt.org>
*/
@@ -32,7 +32,8 @@ public class MD5Signature extends Signat
public byte[] calculate(Content content, Parse parse) {
byte[] data = content.getContent();
- if (data == null) data = content.getUrl().getBytes();
+ if (data == null)
+ data = content.getUrl().getBytes();
return MD5Hash.digest(data).getDigest();
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Thu Jan 29 05:38:59 2015
@@ -47,19 +47,19 @@ import org.apache.hadoop.util.StringUtil
import org.apache.nutch.protocol.ProtocolStatus;
/**
- * A writable map, with a similar behavior as <code>java.util.HashMap</code>.
- * In addition to the size of key and value writable tuple two additional bytes
- * are stored to identify the Writable classes. This means that a maximum of
- * 255 different class types can be used for key and value objects.
- * A binary-id to class mapping is defined in a static block of this class.
- * However it is possible to use custom implementations of Writable.
- * For these custom Writables we write the byte id - utf class name tuple
- * into the header of each MapWritable that uses these types.
- *
+ * A writable map with behavior similar to <code>java.util.HashMap</code>. In
+ * addition to the size of the key and value writable tuple, two additional bytes are
+ * stored to identify the Writable classes. This means that a maximum of 255
+ * different class types can be used for key and value objects. A binary-id to
+ * class mapping is defined in a static block of this class. However it is
+ * possible to use custom implementations of Writable. For these custom
+ * Writables we write the byte id - utf class name tuple into the header of each
+ * MapWritable that uses these types.
+ *
* @author Stefan Groschupf
* @deprecated Use org.apache.hadoop.io.MapWritable instead.
*/
-
+
@Deprecated
public class MapWritable implements Writable {
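
(A round-trip sketch, not part of this commit; put() is assumed from the
HashMap-like contract described above, and the buffer handling mirrors the copy
constructor shown just below.)

  MapWritable map = new MapWritable();
  map.put(new Text("content-type"), new Text("text/html"));
  map.put(new Text("fetch-count"), new IntWritable(3));

  DataOutputBuffer dob = new DataOutputBuffer();
  map.write(dob);                       // writes class ids (or id/class-name headers) plus entries

  DataInputBuffer dib = new DataInputBuffer();
  dib.reset(dob.getData(), dob.getLength());
  MapWritable copy = new MapWritable();
  copy.readFields(dib);                 // restores the same key/value pairs
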
@@ -105,14 +105,16 @@ public class MapWritable implements Writ
CLASS_ID_MAP.put(clazz, byteId);
ID_CLASS_MAP.put(byteId, clazz);
}
-
- public MapWritable() { }
-
+
+ public MapWritable() {
+ }
+
/**
* Copy constructor. This constructor makes a deep copy, using serialization /
* deserialization to break any possible references to contained objects.
*
- * @param map map to copy from
+ * @param map
+ * map to copy from
*/
public MapWritable(MapWritable map) {
if (map != null) {
@@ -123,8 +125,8 @@ public class MapWritable implements Writ
dib.reset(dob.getData(), dob.getLength());
readFields(dib);
} catch (IOException e) {
- throw new IllegalArgumentException("this map cannot be copied: " +
- StringUtils.stringifyException(e));
+ throw new IllegalArgumentException("this map cannot be copied: "
+ + StringUtils.stringifyException(e));
}
}
}
@@ -177,7 +179,8 @@ public class MapWritable implements Writ
public Set<Writable> keySet() {
HashSet<Writable> set = new HashSet<Writable>();
- if (isEmpty()) return set;
+ if (isEmpty())
+ return set;
set.add(fFirst.fKey);
KeyValueEntry entry = fFirst;
while ((entry = entry.fNextEntry) != null) {
@@ -257,7 +260,8 @@ public class MapWritable implements Writ
public boolean equals(Object obj) {
if (obj instanceof MapWritable) {
MapWritable map = (MapWritable) obj;
- if (fSize != map.fSize) return false;
+ if (fSize != map.fSize)
+ return false;
HashSet<KeyValueEntry> set1 = new HashSet<KeyValueEntry>();
KeyValueEntry e1 = fFirst;
while (e1 != null) {
@@ -345,7 +349,7 @@ public class MapWritable implements Writ
clazz = Class.forName(Text.readString(in));
addIdEntry(id, clazz);
} catch (Exception e) {
- if (LOG.isWarnEnabled()) {
+ if (LOG.isWarnEnabled()) {
LOG.warn("Unable to load internal map entry" + e.toString());
}
fIdCount--;
@@ -364,8 +368,8 @@ public class MapWritable implements Writ
}
} catch (IOException e) {
if (LOG.isWarnEnabled()) {
- LOG.warn("Unable to load meta data entry, ignoring.. : " +
- e.toString());
+ LOG.warn("Unable to load meta data entry, ignoring.. : "
+ + e.toString());
}
fSize--;
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java Thu Jan 29 05:38:59 2015
@@ -34,29 +34,31 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Extension of @see AdaptiveFetchSchedule that allows for more flexible configuration
- * of DEC and INC factors for various MIME-types.
- *
- * This class can be typically used in cases where a recrawl consists of many different
- * MIME-types. It's not very common for MIME-types other than text/html to change frequently.
- * Using this class you can configure different factors per MIME-type so to prefer frequently
- * changing MIME-types over others.
+ * Extension of @see AdaptiveFetchSchedule that allows for more flexible
+ * configuration of DEC and INC factors for various MIME-types.
+ *
+ * This class can be typically used in cases where a recrawl consists of many
+ * different MIME-types. It's not very common for MIME-types other than
+ * text/html to change frequently. Using this class you can configure different
+ * factors per MIME-type so to prefer frequently changing MIME-types over
+ * others.
+ *
+ * For it to work this class relies on the Content-Type MetaData key being
+ * present in the CrawlDB. This can either be done when injecting new URLs or
+ * by adding "Content-Type" to the db.parsemeta.to.crawldb configuration setting
+ * to force MIME-types of newly discovered URLs to be added to the CrawlDB.
*
- * For it to work this class relies on the Content-Type MetaData key being present in the CrawlDB.
- * This can either be done when injecting new URL's or by adding "Content-Type" to the
- * db.parsemeta.to.crawldb configuration setting to force MIME-types of newly discovered URL's to
- * be added to the CrawlDB.
- *
* @author markus
*/
public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule {
// Logger
- public static final Logger LOG = LoggerFactory.getLogger(MimeAdaptiveFetchSchedule.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(MimeAdaptiveFetchSchedule.class);
// Conf directives
public static final String SCHEDULE_INC_RATE = "db.fetch.schedule.adaptive.inc_rate";
public static final String SCHEDULE_DEC_RATE = "db.fetch.schedule.adaptive.dec_rate";
- public static final String SCHEDULE_MIME_FILE= "db.fetch.schedule.mime.file";
+ public static final String SCHEDULE_MIME_FILE = "db.fetch.schedule.mime.file";
// Default values for DEC and INC rate
private float defaultIncRate;
@@ -74,18 +76,21 @@ public class MimeAdaptiveFetchSchedule e
}
// Here we store the mime's and their delta's
- private HashMap<String,AdaptiveRate> mimeMap;
+ private HashMap<String, AdaptiveRate> mimeMap;
public void setConf(Configuration conf) {
super.setConf(conf);
- if (conf == null) return;
+ if (conf == null)
+ return;
- // Read and set the default INC and DEC rates in case we cannot set values based on MIME-type
+ // Read and set the default INC and DEC rates in case we cannot set values
+ // based on MIME-type
defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);
// Where's the mime/factor file?
- Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, "adaptive-mimetypes.txt"));
+ Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE,
+ "adaptive-mimetypes.txt"));
try {
readMimeFile(mimeFile);
@@ -96,8 +101,8 @@ public class MimeAdaptiveFetchSchedule e
@Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state) {
+ long prevFetchTime, long prevModifiedTime, long fetchTime,
+ long modifiedTime, int state) {
// Set defaults
INC_RATE = defaultIncRate;
@@ -106,7 +111,8 @@ public class MimeAdaptiveFetchSchedule e
// Check if the Content-Type field is available in the CrawlDatum
if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) {
// Get the MIME-type of the current URL
- String currentMime = datum.getMetaData().get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString();
+ String currentMime = datum.getMetaData()
+ .get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString();
// Get rid of charset
currentMime = currentMime.substring(0, currentMime.indexOf(';'));
@@ -120,18 +126,19 @@ public class MimeAdaptiveFetchSchedule e
}
return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
- fetchTime, modifiedTime, state);
+ fetchTime, modifiedTime, state);
}
/**
* Reads the mime types and their associated INC/DEC factors in a HashMap
- *
- * @param mimeFile Reader
+ *
+ * @param mimeFile
+ * Reader
* @return void
*/
private void readMimeFile(Reader mimeFile) throws IOException {
// Instance of our mime/factor map
- mimeMap = new HashMap<String,AdaptiveRate>();
+ mimeMap = new HashMap<String, AdaptiveRate>();
// Open a reader
BufferedReader reader = new BufferedReader(mimeFile);
@@ -149,7 +156,8 @@ public class MimeAdaptiveFetchSchedule e
// Sanity check, we need two or three items
if (splits.length == 3) {
// Add a lower cased MIME-type and the factor to the map
- mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(new Float(splits[1]), new Float(splits[2])));
+ mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(
+ new Float(splits[1]), new Float(splits[2])));
} else {
LOG.warn("Invalid configuration line in: " + line);
}
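
(A sketch of the mime/factor file read above, default name adaptive-mimetypes.txt.
Only the three-field check is visible in this hunk, so the exact delimiter and the
factor values below are assumptions: MIME-type, INC factor, DEC factor per line.)

  text/html         0.2   0.2
  application/pdf   0.1   0.4
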
@@ -178,7 +186,8 @@ public class MimeAdaptiveFetchSchedule e
// Set a default MIME-type to test with
org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
- x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
+ x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text(
+ "text/html; charset=utf-8"));
p.setMetaData(x);
p.setFetchTime(0);
@@ -187,37 +196,45 @@ public class MimeAdaptiveFetchSchedule e
// let's move the timeline a couple of deltas
for (int i = 0; i < 10000; i++) {
if (lastModified + update < curTime) {
- //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
+ // System.out.println("i=" + i + ", lastModified=" + lastModified +
+ // ", update=" + update + ", curTime=" + curTime);
changed = true;
changeCnt++;
lastModified = curTime;
}
- LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
+ LOG.info(i + ". " + changed + "\twill fetch at "
+ + (p.getFetchTime() / delta) + "\tinterval "
+ + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+ + miss);
if (p.getFetchTime() <= curTime) {
fetchCnt++;
- fs.setFetchSchedule(new Text("http://www.example.com"), p,
- p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
- changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
-
- LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
-
- if (!changed) miss++;
- if (miss > maxMiss) maxMiss = miss;
+ fs.setFetchSchedule(new Text("http://www.example.com"), p, p
+ .getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+ changed ? FetchSchedule.STATUS_MODIFIED
+ : FetchSchedule.STATUS_NOTMODIFIED);
+
+ LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+ + (p.getFetchTime() / delta) + "\tinterval "
+ + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
+
+ if (!changed)
+ miss++;
+ if (miss > maxMiss)
+ maxMiss = miss;
changed = false;
totalMiss += miss;
miss = 0;
}
- if (changed) miss++;
+ if (changed)
+ miss++;
curTime += delta;
}
LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
- LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
+ LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+ + " times.");
}
-
}
\ No newline at end of file
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Thu Jan 29 05:38:59 2015
@@ -26,32 +26,31 @@ public class NutchWritable extends Gener
static {
CLASSES = (Class<? extends Writable>[]) new Class<?>[] {
- org.apache.hadoop.io.NullWritable.class,
- org.apache.hadoop.io.BooleanWritable.class,
- org.apache.hadoop.io.LongWritable.class,
- org.apache.hadoop.io.BytesWritable.class,
- org.apache.hadoop.io.FloatWritable.class,
- org.apache.hadoop.io.IntWritable.class,
- org.apache.hadoop.io.MapWritable.class,
- org.apache.hadoop.io.Text.class,
- org.apache.hadoop.io.MD5Hash.class,
- org.apache.nutch.crawl.CrawlDatum.class,
- org.apache.nutch.crawl.Inlink.class,
- org.apache.nutch.crawl.Inlinks.class,
- org.apache.nutch.indexer.NutchIndexAction.class,
- org.apache.nutch.metadata.Metadata.class,
- org.apache.nutch.parse.Outlink.class,
- org.apache.nutch.parse.ParseText.class,
- org.apache.nutch.parse.ParseData.class,
- org.apache.nutch.parse.ParseImpl.class,
- org.apache.nutch.parse.ParseStatus.class,
- org.apache.nutch.protocol.Content.class,
- org.apache.nutch.protocol.ProtocolStatus.class,
- org.apache.nutch.scoring.webgraph.LinkDatum.class
- };
+ org.apache.hadoop.io.NullWritable.class,
+ org.apache.hadoop.io.BooleanWritable.class,
+ org.apache.hadoop.io.LongWritable.class,
+ org.apache.hadoop.io.BytesWritable.class,
+ org.apache.hadoop.io.FloatWritable.class,
+ org.apache.hadoop.io.IntWritable.class,
+ org.apache.hadoop.io.MapWritable.class,
+ org.apache.hadoop.io.Text.class, org.apache.hadoop.io.MD5Hash.class,
+ org.apache.nutch.crawl.CrawlDatum.class,
+ org.apache.nutch.crawl.Inlink.class,
+ org.apache.nutch.crawl.Inlinks.class,
+ org.apache.nutch.indexer.NutchIndexAction.class,
+ org.apache.nutch.metadata.Metadata.class,
+ org.apache.nutch.parse.Outlink.class,
+ org.apache.nutch.parse.ParseText.class,
+ org.apache.nutch.parse.ParseData.class,
+ org.apache.nutch.parse.ParseImpl.class,
+ org.apache.nutch.parse.ParseStatus.class,
+ org.apache.nutch.protocol.Content.class,
+ org.apache.nutch.protocol.ProtocolStatus.class,
+ org.apache.nutch.scoring.webgraph.LinkDatum.class };
}
- public NutchWritable() { }
+ public NutchWritable() {
+ }
public NutchWritable(Writable instance) {
set(instance);
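
NutchWritable builds on Hadoop's GenericWritable: the CLASSES table above fixes the set of payload types it may carry, so only a small class index plus the payload itself is serialized. A minimal sketch of wrapping and unwrapping a value (not part of this commit; the class name NutchWritableSketch is assumed):

  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.io.Writable;
  import org.apache.nutch.crawl.NutchWritable;

  public class NutchWritableSketch {
    public static void main(String[] args) {
      // Wrap a Text payload; Text is one of the registered classes above.
      NutchWritable wrapped = new NutchWritable(new Text("http://www.example.com/"));
      // get() returns the wrapped instance, typed as Writable.
      Writable payload = wrapped.get();
      System.out.println(payload);
    }
  }
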
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java Thu Jan 29 05:38:59 2015
@@ -24,7 +24,7 @@ import org.apache.hadoop.conf.Configurab
public abstract class Signature implements Configurable {
protected Configuration conf;
-
+
public abstract byte[] calculate(Content content, Parse parse);
public Configuration getConf() {
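
Signature itself stays a small Configurable base class: concrete implementations only have to provide calculate(Content, Parse). As a hedged illustration of the extension point (UrlOnlySignature is a hypothetical class written for this example, not something in Nutch), a signature derived from the URL alone could look like this:

  import org.apache.hadoop.io.MD5Hash;
  import org.apache.nutch.crawl.Signature;
  import org.apache.nutch.parse.Parse;
  import org.apache.nutch.protocol.Content;

  // Hypothetical example: derive the signature from the page URL only,
  // ignoring the parsed text entirely.
  public class UrlOnlySignature extends Signature {
    public byte[] calculate(Content content, Parse parse) {
      return MD5Hash.digest(content.getUrl()).getDigest();
    }
  }

Such a class would then be selected through the db.signature.class property handled by SignatureFactory below.
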
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java Thu Jan 29 05:38:59 2015
@@ -23,25 +23,34 @@ public class SignatureComparator impleme
public int compare(Object o1, Object o2) {
return _compare(o1, o2);
}
-
+
public static int _compare(Object o1, Object o2) {
- if (o1 == null && o2 == null) return 0;
- if (o1 == null) return -1;
- if (o2 == null) return 1;
- if (!(o1 instanceof byte[])) return -1;
- if (!(o2 instanceof byte[])) return 1;
- byte[] data1 = (byte[])o1;
- byte[] data2 = (byte[])o2;
+ if (o1 == null && o2 == null)
+ return 0;
+ if (o1 == null)
+ return -1;
+ if (o2 == null)
+ return 1;
+ if (!(o1 instanceof byte[]))
+ return -1;
+ if (!(o2 instanceof byte[]))
+ return 1;
+ byte[] data1 = (byte[]) o1;
+ byte[] data2 = (byte[]) o2;
return _compare(data1, 0, data1.length, data2, 0, data2.length);
}
-
- public static int _compare(byte[] data1, int s1, int l1, byte[] data2, int s2, int l2) {
- if (l2 > l1) return -1;
- if (l2 < l1) return 1;
+
+ public static int _compare(byte[] data1, int s1, int l1, byte[] data2,
+ int s2, int l2) {
+ if (l2 > l1)
+ return -1;
+ if (l2 < l1)
+ return 1;
int res = 0;
for (int i = 0; i < l1; i++) {
res = (data1[s1 + i] - data2[s2 + i]);
- if (res != 0) return res;
+ if (res != 0)
+ return res;
}
return 0;
}
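
The comparison rules are easy to read off the code above: null and non-byte[] arguments sort low, shorter signatures sort before longer ones, and equal-length signatures are compared byte by byte. A small usage sketch of the static helpers (not part of this commit):

  import org.apache.nutch.crawl.SignatureComparator;

  public class SignatureCompareSketch {
    public static void main(String[] args) {
      byte[] a = { 0x01, 0x02 };
      byte[] b = { 0x01, 0x03 };
      byte[] c = { 0x01, 0x02, 0x03 };

      // Equal length: compared byte by byte, so a sorts before b.
      System.out.println(SignatureComparator._compare(a, b));    // negative
      // Different length: the shorter signature sorts first.
      System.out.println(SignatureComparator._compare(a, c));    // negative
      // Nulls are tolerated rather than throwing.
      System.out.println(SignatureComparator._compare(null, a)); // -1
    }
  }
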
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java Thu Jan 29 05:38:59 2015
@@ -27,28 +27,30 @@ import org.apache.nutch.util.ObjectCache
/**
* Factory class, which instantiates a Signature implementation according to the
- * current Configuration configuration. This newly created instance is cached in the
- * Configuration instance, so that it could be later retrieved.
+ * current Configuration configuration. This newly created instance is cached in
+ * the Configuration instance, so that it could be later retrieved.
*
* @author Andrzej Bialecki <ab@getopt.org>
*/
public class SignatureFactory {
- private static final Logger LOG = LoggerFactory.getLogger(SignatureFactory.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(SignatureFactory.class);
- private SignatureFactory() {} // no public ctor
+ private SignatureFactory() {
+ } // no public ctor
/** Return the default Signature implementation. */
public synchronized static Signature getSignature(Configuration conf) {
String clazz = conf.get("db.signature.class", MD5Signature.class.getName());
ObjectCache objectCache = ObjectCache.get(conf);
- Signature impl = (Signature)objectCache.getObject(clazz);
+ Signature impl = (Signature) objectCache.getObject(clazz);
if (impl == null) {
try {
if (LOG.isInfoEnabled()) {
LOG.info("Using Signature impl: " + clazz);
}
Class<?> implClass = Class.forName(clazz);
- impl = (Signature)implClass.newInstance();
+ impl = (Signature) implClass.newInstance();
impl.setConf(conf);
objectCache.setObject(clazz, impl);
} catch (Exception e) {
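
SignatureFactory resolves db.signature.class reflectively and stores the instance in the ObjectCache attached to the Configuration, so repeated lookups with the same Configuration return the same object. A minimal usage sketch (not part of this commit; choosing TextProfileSignature here is only an example, MD5Signature remains the default):

  import org.apache.hadoop.conf.Configuration;
  import org.apache.nutch.crawl.Signature;
  import org.apache.nutch.crawl.SignatureFactory;
  import org.apache.nutch.util.NutchConfiguration;

  public class SignatureFactorySketch {
    public static void main(String[] args) {
      Configuration conf = NutchConfiguration.create();
      // Select an implementation; without this property MD5Signature is used.
      conf.set("db.signature.class",
          "org.apache.nutch.crawl.TextProfileSignature");

      Signature first = SignatureFactory.getSignature(conf);
      Signature second = SignatureFactory.getSignature(conf);
      // The second call hits the per-Configuration ObjectCache.
      System.out.println(first.getClass().getName()
          + ", cached: " + (first == second));
    }
  }
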
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java Thu Jan 29 05:38:59 2015
@@ -22,9 +22,9 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.protocol.Content;
/**
- * Implementation of a page signature. It calculates an MD5 hash
- * of the textual content of a page. In case there is no content, it
- * calculates a hash from the page's URL.
+ * Implementation of a page signature. It calculates an MD5 hash of the textual
+ * content of a page. In case there is no content, it calculates a hash from the
+ * page's URL.
*/
public class TextMD5Signature extends Signature {
@@ -36,7 +36,7 @@ public class TextMD5Signature extends Si
if (text == null || text.length() == 0) {
return fallback.calculate(content, parse);
}
-
+
return MD5Hash.digest(text).getDigest();
}
}
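
TextMD5Signature hashes only the extracted text and delegates to its fallback signature when the text is empty, so two pages with identical plain text produce identical signatures regardless of markup. A short sketch of computing one directly (not part of this commit; passing a null Content together with a ParseImpl follows the pattern used in the TextProfileSignature main() below, and StringUtil.toHexString is assumed for printing the digest):

  import org.apache.nutch.crawl.TextMD5Signature;
  import org.apache.nutch.parse.ParseImpl;
  import org.apache.nutch.util.NutchConfiguration;
  import org.apache.nutch.util.StringUtil;

  public class TextMD5SignatureSketch {
    public static void main(String[] args) {
      TextMD5Signature sig = new TextMD5Signature();
      sig.setConf(NutchConfiguration.create());
      // Only the text matters; markup and headers never enter the hash.
      byte[] digest = sig.calculate(null, new ParseImpl("Hello Nutch", null));
      System.out.println(StringUtil.toHexString(digest));
    }
  }
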
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java Thu Jan 29 05:38:59 2015
@@ -35,41 +35,50 @@ import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.NutchConfiguration;
/**
- * <p>An implementation of a page signature. It calculates an MD5 hash
- * of a plain text "profile" of a page. In case there is no text, it
- * calculates a hash using the {@link MD5Signature}.</p>
- * <p>The algorithm to calculate a page "profile" takes the plain text version of
- * a page and performs the following steps:
+ * <p>
+ * An implementation of a page signature. It calculates an MD5 hash of a plain
+ * text "profile" of a page. In case there is no text, it calculates a hash
+ * using the {@link MD5Signature}.
+ * </p>
+ * <p>
+ * The algorithm to calculate a page "profile" takes the plain text version of a
+ * page and performs the following steps:
* <ul>
* <li>remove all characters except letters and digits, and bring all characters
* to lower case,</li>
* <li>split the text into tokens (all consecutive non-whitespace characters),</li>
- * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2 characters),</li>
+ * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2
+ * characters),</li>
* <li>sort the list of tokens by decreasing frequency,</li>
- * <li>round down the counts of tokens to the nearest multiple of QUANT
- * (<code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is 0.01f
- * by default, and <code>maxFreq</code> is the maximum token frequency). If
- * <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2 (which
- * means that tokens with frequency 1 are always discarded).</li>
- * <li>tokens, which frequency after quantization falls below QUANT, are discarded.</li>
- * <li>create a list of tokens and their quantized frequency, separated by spaces,
- * in the order of decreasing frequency.</li>
+ * <li>round down the counts of tokens to the nearest multiple of QUANT (
+ * <code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is
+ * 0.01f by default, and <code>maxFreq</code> is the maximum token frequency).
+ * If <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2
+ * (which means that tokens with frequency 1 are always discarded).</li>
+ * <li>tokens, which frequency after quantization falls below QUANT, are
+ * discarded.</li>
+ * <li>create a list of tokens and their quantized frequency, separated by
+ * spaces, in the order of decreasing frequency.</li>
* </ul>
* This list is then submitted to an MD5 hash calculation.
*
* @author Andrzej Bialecki <ab@getopt.org>
*/
public class TextProfileSignature extends Signature {
-
+
Signature fallback = new MD5Signature();
public byte[] calculate(Content content, Parse parse) {
- int MIN_TOKEN_LEN = getConf().getInt("db.signature.text_profile.min_token_len", 2);
- float QUANT_RATE = getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
+ int MIN_TOKEN_LEN = getConf().getInt(
+ "db.signature.text_profile.min_token_len", 2);
+ float QUANT_RATE = getConf().getFloat(
+ "db.signature.text_profile.quant_rate", 0.01f);
HashMap<String, Token> tokens = new HashMap<String, Token>();
String text = null;
- if (parse != null) text = parse.getText();
- if (text == null || text.length() == 0) return fallback.calculate(content, parse);
+ if (parse != null)
+ text = parse.getText();
+ if (text == null || text.length() == 0)
+ return fallback.calculate(content, parse);
StringBuffer curToken = new StringBuffer();
int maxFreq = 0;
for (int i = 0; i < text.length(); i++) {
@@ -87,7 +96,8 @@ public class TextProfileSignature extend
tokens.put(s, tok);
}
tok.cnt++;
- if (tok.cnt > maxFreq) maxFreq = tok.cnt;
+ if (tok.cnt > maxFreq)
+ maxFreq = tok.cnt;
}
curToken.setLength(0);
}
@@ -103,17 +113,20 @@ public class TextProfileSignature extend
tokens.put(s, tok);
}
tok.cnt++;
- if (tok.cnt > maxFreq) maxFreq = tok.cnt;
+ if (tok.cnt > maxFreq)
+ maxFreq = tok.cnt;
}
Iterator<Token> it = tokens.values().iterator();
ArrayList<Token> profile = new ArrayList<Token>();
// calculate the QUANT value
int QUANT = Math.round(maxFreq * QUANT_RATE);
if (QUANT < 2) {
- if (maxFreq > 1) QUANT = 2;
- else QUANT = 1;
+ if (maxFreq > 1)
+ QUANT = 2;
+ else
+ QUANT = 1;
}
- while(it.hasNext()) {
+ while (it.hasNext()) {
Token t = it.next();
// round down to the nearest QUANT
t.cnt = (t.cnt / QUANT) * QUANT;
@@ -128,32 +141,33 @@ public class TextProfileSignature extend
it = profile.iterator();
while (it.hasNext()) {
Token t = it.next();
- if (newText.length() > 0) newText.append("\n");
+ if (newText.length() > 0)
+ newText.append("\n");
newText.append(t.toString());
}
return MD5Hash.digest(newText.toString()).getDigest();
}
-
+
private static class Token {
public int cnt;
public String val;
-
+
public Token(int cnt, String val) {
this.cnt = cnt;
this.val = val;
}
-
+
public String toString() {
return val + " " + cnt;
}
}
-
+
private static class TokenComparator implements Comparator<Token> {
public int compare(Token t1, Token t2) {
return t2.cnt - t1.cnt;
}
}
-
+
public static void main(String[] args) throws Exception {
TextProfileSignature sig = new TextProfileSignature();
sig.setConf(NutchConfiguration.create());
@@ -161,15 +175,18 @@ public class TextProfileSignature extend
File[] files = new File(args[0]).listFiles();
for (int i = 0; i < files.length; i++) {
FileInputStream fis = new FileInputStream(files[i]);
- BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
+ BufferedReader br = new BufferedReader(
+ new InputStreamReader(fis, "UTF-8"));
StringBuffer text = new StringBuffer();
String line = null;
while ((line = br.readLine()) != null) {
- if (text.length() > 0) text.append("\n");
+ if (text.length() > 0)
+ text.append("\n");
text.append(line);
}
br.close();
- byte[] signature = sig.calculate(null, new ParseImpl(text.toString(), null));
+ byte[] signature = sig.calculate(null, new ParseImpl(text.toString(),
+ null));
res.put(files[i].toString(), signature);
}
Iterator<String> it = res.keySet().iterator();
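
The profile construction documented above is what makes this signature robust to small edits: tokens whose quantized frequency drops below QUANT fall out of the profile, so changes confined to rare words leave the hash untouched. A hedged sketch of that effect (not part of this commit; the two toy strings are assumptions, and the expectation that their signatures coincide follows from the quantization rules, not from output recorded here):

  import org.apache.nutch.crawl.TextProfileSignature;
  import org.apache.nutch.parse.ParseImpl;
  import org.apache.nutch.util.NutchConfiguration;
  import org.apache.nutch.util.StringUtil;

  public class TextProfileSignatureSketch {
    public static void main(String[] args) {
      TextProfileSignature sig = new TextProfileSignature();
      sig.setConf(NutchConfiguration.create());

      // "fetch" and "index" each occur once; with maxFreq 3 the QUANT value
      // becomes 2, so both rare tokens are quantized away and the two
      // profiles should end up identical.
      String a = "nutch nutch nutch crawl crawl fetch";
      String b = "nutch nutch nutch crawl crawl index";

      byte[] sigA = sig.calculate(null, new ParseImpl(a, null));
      byte[] sigB = sig.calculate(null, new ParseImpl(b, null));
      System.out.println(StringUtil.toHexString(sigA));
      System.out.println(StringUtil.toHexString(sigB));
    }
  }
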
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java Thu Jan 29 05:38:59 2015
@@ -33,8 +33,9 @@ import org.apache.nutch.util.URLUtil;
* Partition urls by host, domain name or IP depending on the value of the
* parameter 'partition.url.mode' which can be 'byHost', 'byDomain' or 'byIP'
*/
-public class URLPartitioner implements Partitioner<Text,Writable> {
- private static final Logger LOG = LoggerFactory.getLogger(URLPartitioner.class);
+public class URLPartitioner implements Partitioner<Text, Writable> {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(URLPartitioner.class);
public static final String PARTITION_MODE_KEY = "partition.url.mode";
@@ -58,7 +59,8 @@ public class URLPartitioner implements P
normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
}
- public void close() {}
+ public void close() {
+ }
/** Hash by domain name. */
public int getPartition(Text key, Writable value, int numReduceTasks) {
@@ -66,15 +68,16 @@ public class URLPartitioner implements P
URL url = null;
int hashCode = urlString.hashCode();
try {
- urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
+ urlString = normalizers.normalize(urlString,
+ URLNormalizers.SCOPE_PARTITION);
url = new URL(urlString);
hashCode = url.getHost().hashCode();
} catch (MalformedURLException e) {
LOG.warn("Malformed URL: '" + urlString + "'");
}
- if (mode.equals(PARTITION_MODE_DOMAIN) && url != null) hashCode = URLUtil
- .getDomainName(url).hashCode();
+ if (mode.equals(PARTITION_MODE_DOMAIN) && url != null)
+ hashCode = URLUtil.getDomainName(url).hashCode();
else if (mode.equals(PARTITION_MODE_IP)) {
try {
InetAddress address = InetAddress.getByName(url.getHost());