Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC

svn commit: r1655526 [3/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/pr...

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Jan 29 05:38:59 2015
@@ -51,9 +51,9 @@ import org.apache.nutch.util.URLUtil;
  * Generates a subset of a crawl db to fetch. This version allows to generate
  * fetchlists for several segments in one go. Unlike in the initial version
  * (OldGenerator), the IP resolution is done ONLY on the entries which have been
- * selected for fetching. The URLs are partitioned by IP, domain or host within a 
- * segment. We can chose separately how to count the URLS i.e. by domain or host
- * to limit the entries.
+ * selected for fetching. The URLs are partitioned by IP, domain or host within
+ * a segment. We can chose separately how to count the URLS i.e. by domain or
+ * host to limit the entries.
  **/
 public class Generator extends Configured implements Tool {
 
@@ -73,7 +73,7 @@ public class Generator extends Configure
   public static final String GENERATOR_CUR_TIME = "generate.curTime";
   public static final String GENERATOR_DELAY = "crawl.gen.delay";
   public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments";
-  
+
   public static class SelectorEntry implements Writable {
     public Text url;
     public CrawlDatum datum;
@@ -98,25 +98,25 @@ public class Generator extends Configure
     }
 
     public String toString() {
-      return "url=" + url.toString() + ", datum=" + datum.toString() + ", segnum="
-          + segnum.toString();
+      return "url=" + url.toString() + ", datum=" + datum.toString()
+          + ", segnum=" + segnum.toString();
     }
   }
 
   /** Selects entries due for fetch. */
   public static class Selector implements
-      Mapper<Text,CrawlDatum,FloatWritable,SelectorEntry>,
-      Partitioner<FloatWritable,Writable>,
-      Reducer<FloatWritable,SelectorEntry,FloatWritable,SelectorEntry> {
+      Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>,
+      Partitioner<FloatWritable, Writable>,
+      Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> {
     private LongWritable genTime = new LongWritable(System.currentTimeMillis());
     private long curTime;
     private long limit;
     private long count;
-    private HashMap<String,int[]> hostCounts = new HashMap<String,int[]>();
+    private HashMap<String, int[]> hostCounts = new HashMap<String, int[]>();
     private int segCounts[];
     private int maxCount;
     private boolean byDomain = false;
-    private Partitioner<Text,Writable> partitioner = new URLPartitioner();
+    private Partitioner<Text, Writable> partitioner = new URLPartitioner();
     private URLFilters filters;
     private URLNormalizers normalizers;
     private ScoringFilters scfilters;
@@ -134,22 +134,26 @@ public class Generator extends Configure
 
     public void configure(JobConf job) {
       curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());
-      limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE) / job.getNumReduceTasks();
+      limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE)
+          / job.getNumReduceTasks();
       maxCount = job.getInt(GENERATOR_MAX_COUNT, -1);
-      if (maxCount==-1){
+      if (maxCount == -1) {
         byDomain = false;
       }
-      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true;
+      if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE)))
+        byDomain = true;
       filters = new URLFilters(job);
       normalise = job.getBoolean(GENERATOR_NORMALISE, true);
-      if (normalise) normalizers = new URLNormalizers(job,
-          URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
+      if (normalise)
+        normalizers = new URLNormalizers(job,
+            URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
       scfilters = new ScoringFilters(job);
       partitioner.configure(job);
       filter = job.getBoolean(GENERATOR_FILTER, true);
       genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L;
       long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
-      if (time > 0) genTime.set(time);
+      if (time > 0)
+        genTime.set(time);
       schedule = FetchScheduleFactory.getFetchSchedule(job);
       scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
       intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
@@ -158,21 +162,24 @@ public class Generator extends Configure
       segCounts = new int[maxNumSegments];
     }
 
-    public void close() {}
+    public void close() {
+    }
 
     /** Select & invert subset due for fetch. */
     public void map(Text key, CrawlDatum value,
-        OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
+        OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
         throws IOException {
       Text url = key;
       if (filter) {
         // If filtering is on don't generate URLs that don't pass
         // URLFilters
         try {
-          if (filters.filter(url.toString()) == null) return;
+          if (filters.filter(url.toString()) == null)
+            return;
         } catch (URLFilterException e) {
           if (LOG.isWarnEnabled()) {
-            LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+            LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage()
+                + ")");
           }
         }
       }
@@ -189,8 +196,8 @@ public class Generator extends Configure
           Nutch.WRITABLE_GENERATE_TIME_KEY);
       if (oldGenTime != null) { // awaiting fetch & update
         if (oldGenTime.get() + genDelay > curTime) // still wait for
-        // update
-        return;
+          // update
+          return;
       }
       float sort = 1.0f;
       try {
@@ -202,13 +209,19 @@ public class Generator extends Configure
       }
 
       if (restrictStatus != null
-        && !restrictStatus.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) return;
+          && !restrictStatus.equalsIgnoreCase(CrawlDatum
+              .getStatusName(crawlDatum.getStatus())))
+        return;
 
       // consider only entries with a score superior to the threshold
-      if (scoreThreshold != Float.NaN && sort < scoreThreshold) return;
+      if (scoreThreshold != Float.NaN && sort < scoreThreshold)
+        return;
 
-      // consider only entries with a retry (or fetch) interval lower than threshold
-      if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) return;
+      // consider only entries with a retry (or fetch) interval lower than
+      // threshold
+      if (intervalThreshold != -1
+          && crawlDatum.getFetchInterval() > intervalThreshold)
+        return;
 
       // sort by decreasing score, using DecreasingFloatComparator
       sortValue.set(sort);
@@ -220,13 +233,15 @@ public class Generator extends Configure
     }
 
     /** Partition by host / domain or IP. */
-    public int getPartition(FloatWritable key, Writable value, int numReduceTasks) {
-      return partitioner.getPartition(((SelectorEntry) value).url, key, numReduceTasks);
+    public int getPartition(FloatWritable key, Writable value,
+        int numReduceTasks) {
+      return partitioner.getPartition(((SelectorEntry) value).url, key,
+          numReduceTasks);
     }
 
     /** Collect until limit is reached. */
     public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
-        OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
+        OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
         throws IOException {
 
       while (values.hasNext()) {
@@ -236,7 +251,8 @@ public class Generator extends Configure
           if (currentsegmentnum < maxNumSegments) {
             count = 0;
             currentsegmentnum++;
-          } else break;
+          } else
+            break;
         }
 
         SelectorEntry entry = values.next();
@@ -270,7 +286,7 @@ public class Generator extends Configure
         if (maxCount > 0) {
           int[] hostCount = hostCounts.get(hostordomain);
           if (hostCount == null) {
-            hostCount = new int[] {1, 0};
+            hostCount = new int[] { 1, 0 };
             hostCounts.put(hostordomain, hostCount);
           }
 
@@ -278,7 +294,8 @@ public class Generator extends Configure
           hostCount[1]++;
 
           // check if topN reached, select next segment if it is
-          while (segCounts[hostCount[0]-1] >= limit && hostCount[0] < maxNumSegments) {
+          while (segCounts[hostCount[0] - 1] >= limit
+              && hostCount[0] < maxNumSegments) {
             hostCount[0]++;
             hostCount[1] = 0;
           }
@@ -291,18 +308,23 @@ public class Generator extends Configure
               hostCount[1] = 0;
             } else {
               if (hostCount[1] == maxCount + 1 && LOG.isInfoEnabled()) {
-                LOG.info("Host or domain " + hostordomain + " has more than " + maxCount
-                    + " URLs for all " + maxNumSegments + " segments. Additional URLs won't be included in the fetchlist.");
+                LOG.info("Host or domain "
+                    + hostordomain
+                    + " has more than "
+                    + maxCount
+                    + " URLs for all "
+                    + maxNumSegments
+                    + " segments. Additional URLs won't be included in the fetchlist.");
               }
               // skip this entry
               continue;
             }
           }
           entry.segnum = new IntWritable(hostCount[0]);
-          segCounts[hostCount[0]-1]++;
+          segCounts[hostCount[0] - 1]++;
         } else {
           entry.segnum = new IntWritable(currentsegmentnum);
-          segCounts[currentsegmentnum-1]++;
+          segCounts[currentsegmentnum - 1]++;
         }
 
         output.collect(key, entry);
@@ -316,16 +338,17 @@ public class Generator extends Configure
 
   // Allows the reducers to generate one subfile per
   public static class GeneratorOutputFormat extends
-      MultipleSequenceFileOutputFormat<FloatWritable,SelectorEntry> {
+      MultipleSequenceFileOutputFormat<FloatWritable, SelectorEntry> {
     // generate a filename based on the segnum stored for this entry
-    protected String generateFileNameForKeyValue(FloatWritable key, SelectorEntry value,
-        String name) {
+    protected String generateFileNameForKeyValue(FloatWritable key,
+        SelectorEntry value, String name) {
       return "fetchlist-" + value.segnum.toString() + "/" + name;
     }
 
   }
 
-  public static class DecreasingFloatComparator extends FloatWritable.Comparator {
+  public static class DecreasingFloatComparator extends
+      FloatWritable.Comparator {
 
     /** Compares two FloatWritables decreasing. */
     public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
@@ -334,20 +357,22 @@ public class Generator extends Configure
   }
 
   public static class SelectorInverseMapper extends MapReduceBase implements
-      Mapper<FloatWritable,SelectorEntry,Text,SelectorEntry> {
+      Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> {
 
     public void map(FloatWritable key, SelectorEntry value,
-        OutputCollector<Text,SelectorEntry> output, Reporter reporter) throws IOException {
+        OutputCollector<Text, SelectorEntry> output, Reporter reporter)
+        throws IOException {
       SelectorEntry entry = value;
       output.collect(entry.url, entry);
     }
   }
 
   public static class PartitionReducer extends MapReduceBase implements
-      Reducer<Text,SelectorEntry,Text,CrawlDatum> {
+      Reducer<Text, SelectorEntry, Text, CrawlDatum> {
 
     public void reduce(Text key, Iterator<SelectorEntry> values,
-        OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException {
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
       // if using HashComparator, we get only one input key in case of
       // hash collision
       // so use only URLs from values
@@ -365,7 +390,7 @@ public class Generator extends Configure
       super(Text.class);
     }
 
-    @SuppressWarnings("rawtypes" )
+    @SuppressWarnings("rawtypes")
     public int compare(WritableComparable a, WritableComparable b) {
       Text url1 = (Text) a;
       Text url2 = (Text) b;
@@ -395,15 +420,17 @@ public class Generator extends Configure
    * Update the CrawlDB so that the next generate won't include the same URLs.
    */
   public static class CrawlDbUpdater extends MapReduceBase implements
-      Mapper<Text,CrawlDatum,Text,CrawlDatum>, Reducer<Text,CrawlDatum,Text,CrawlDatum> {
+      Mapper<Text, CrawlDatum, Text, CrawlDatum>,
+      Reducer<Text, CrawlDatum, Text, CrawlDatum> {
     long generateTime;
 
     public void configure(JobConf job) {
       generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
     }
 
-    public void map(Text key, CrawlDatum value, OutputCollector<Text,CrawlDatum> output,
-        Reporter reporter) throws IOException {
+    public void map(Text key, CrawlDatum value,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
       output.collect(key, value);
     }
 
@@ -411,7 +438,8 @@ public class Generator extends Configure
     private LongWritable genTime = new LongWritable(0L);
 
     public void reduce(Text key, Iterator<CrawlDatum> values,
-        OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException {
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+        throws IOException {
       genTime.set(0L);
       while (values.hasNext()) {
         CrawlDatum val = values.next();
@@ -435,19 +463,21 @@ public class Generator extends Configure
     }
   }
 
-  public Generator() {}
+  public Generator() {
+  }
 
   public Generator(Configuration conf) {
     setConf(conf);
   }
 
-  public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime)
-      throws IOException {
+  public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
+      long curTime) throws IOException {
 
     JobConf job = new NutchJob(getConf());
     boolean filter = job.getBoolean(GENERATOR_FILTER, true);
     boolean normalise = job.getBoolean(GENERATOR_NORMALISE, true);
-    return generate(dbDir, segments, numLists, topN, curTime, filter, normalise, false, 1);
+    return generate(dbDir, segments, numLists, topN, curTime, filter,
+        normalise, false, 1);
   }
 
   /**
@@ -456,7 +486,8 @@ public class Generator extends Configure
    **/
   public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
       long curTime, boolean filter, boolean force) throws IOException {
-    return generate(dbDir, segments, numLists, topN, curTime, filter, true, force, 1);
+    return generate(dbDir, segments, numLists, topN, curTime, filter, true,
+        force, 1);
   }
 
   /**
@@ -482,11 +513,11 @@ public class Generator extends Configure
    *           When an I/O error occurs
    */
   public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
-      long curTime, boolean filter, boolean norm, boolean force, int maxNumSegments)
-      throws IOException {
+      long curTime, boolean filter, boolean norm, boolean force,
+      int maxNumSegments) throws IOException {
 
-    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-"
-        + java.util.UUID.randomUUID().toString());
+    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
+        + "/generate-temp-" + java.util.UUID.randomUUID().toString());
 
     Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
     FileSystem fs = FileSystem.get(getConf());
@@ -501,7 +532,7 @@ public class Generator extends Configure
     if (topN != Long.MAX_VALUE) {
       LOG.info("Generator: topN: " + topN);
     }
-    
+
     // map to inverted subset due for fetch, sort by score
     JobConf job = new NutchJob(getConf());
     job.setJobName("generate: select from " + dbDir);
@@ -553,7 +584,8 @@ public class Generator extends Configure
     try {
       for (FileStatus stat : status) {
         Path subfetchlist = stat.getPath();
-        if (!subfetchlist.getName().startsWith("fetchlist-")) continue;
+        if (!subfetchlist.getName().startsWith("fetchlist-"))
+          continue;
         // start a new partition job for this segment
         Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
         generatedSegments.add(newSeg);
@@ -573,8 +605,8 @@ public class Generator extends Configure
 
     if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
       // update the db from tempDir
-      Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-"
-          + java.util.UUID.randomUUID().toString());
+      Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".")
+          + "/generate-temp-" + java.util.UUID.randomUUID().toString());
 
       job = new NutchJob(getConf());
       job.setJobName("generate: updatedb " + dbDir);
@@ -607,7 +639,8 @@ public class Generator extends Configure
     fs.delete(tempDir, true);
 
     long end = System.currentTimeMillis();
-    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
 
     Path[] patharray = new Path[generatedSegments.size()];
     return generatedSegments.toArray(patharray);
@@ -653,7 +686,8 @@ public class Generator extends Configure
   public static synchronized String generateSegmentName() {
     try {
       Thread.sleep(1000);
-    } catch (Throwable t) {}
+    } catch (Throwable t) {
+    }
     ;
     return sdf.format(new Date(System.currentTimeMillis()));
   }
@@ -662,7 +696,8 @@ public class Generator extends Configure
    * Generate a fetchlist from the crawldb.
    */
   public static void main(String args[]) throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new Generator(), args);
+    int res = ToolRunner
+        .run(NutchConfiguration.create(), new Generator(), args);
     System.exit(res);
   }
 
@@ -706,9 +741,10 @@ public class Generator extends Configure
     }
 
     try {
-      Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter,
-          norm, force, maxNumSegments);
-      if (segs == null) return 1;
+      Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime,
+          filter, norm, force, maxNumSegments);
+      if (segs == null)
+        return 1;
     } catch (Exception e) {
       LOG.error("Generator: " + StringUtils.stringifyException(e));
       return -1;

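For reference, a minimal sketch of driving the Generator programmatically through the generate(dbDir, segments, numLists, topN, curTime) overload shown in this diff; the crawldb/segments paths and the topN value below are illustrative only, not taken from this commit.

    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.crawl.Generator;
    import org.apache.nutch.util.NutchConfiguration;

    public class GeneratorSketch {
      public static void main(String[] args) throws Exception {
        // Illustrative paths; a real run points at an existing crawldb.
        Path crawlDb = new Path("crawl/crawldb");
        Path segments = new Path("crawl/segments");
        Generator generator = new Generator(NutchConfiguration.create());
        // Select at most 50000 top-scoring URLs, split into 2 fetchlist
        // partitions for parallel fetching.
        Path[] segs = generator.generate(crawlDb, segments, 2, 50000L,
            System.currentTimeMillis());
        if (segs == null) {
          System.err.println("Generator: no entries due for fetch");
        }
      }
    }
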
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java Thu Jan 29 05:38:59 2015
@@ -26,7 +26,8 @@ public class Inlink implements Writable
   private String fromUrl;
   private String anchor;
 
-  public Inlink() {}
+  public Inlink() {
+  }
 
   public Inlink(String fromUrl, String anchor) {
     this.fromUrl = fromUrl;
@@ -40,8 +41,8 @@ public class Inlink implements Writable
 
   /** Skips over one Inlink in the input. */
   public static void skip(DataInput in) throws IOException {
-    Text.skip(in);                                // skip fromUrl
-    Text.skip(in);                                // skip anchor
+    Text.skip(in); // skip fromUrl
+    Text.skip(in); // skip anchor
   }
 
   public void write(DataOutput out) throws IOException {
@@ -55,16 +56,20 @@ public class Inlink implements Writable
     return inlink;
   }
 
-  public String getFromUrl() { return fromUrl; }
-  public String getAnchor() { return anchor; }
+  public String getFromUrl() {
+    return fromUrl;
+  }
+
+  public String getAnchor() {
+    return anchor;
+  }
 
   public boolean equals(Object o) {
     if (!(o instanceof Inlink))
       return false;
-    Inlink other = (Inlink)o;
-    return
-      this.fromUrl.equals(other.fromUrl) &&
-      this.anchor.equals(other.anchor);
+    Inlink other = (Inlink) o;
+    return this.fromUrl.equals(other.fromUrl)
+        && this.anchor.equals(other.anchor);
   }
 
   public int hashCode() {

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java Thu Jan 29 05:38:59 2015
@@ -27,17 +27,25 @@ import org.apache.hadoop.io.*;
 public class Inlinks implements Writable {
   private HashSet<Inlink> inlinks = new HashSet<Inlink>(1);
 
-  public void add(Inlink inlink) { inlinks.add(inlink); }
+  public void add(Inlink inlink) {
+    inlinks.add(inlink);
+  }
 
-  public void add(Inlinks inlinks) { this.inlinks.addAll(inlinks.inlinks); }
+  public void add(Inlinks inlinks) {
+    this.inlinks.addAll(inlinks.inlinks);
+  }
 
   public Iterator<Inlink> iterator() {
     return this.inlinks.iterator();
   }
-  
-  public int size() { return inlinks.size(); }
 
-  public void clear() { inlinks.clear(); }
+  public int size() {
+    return inlinks.size();
+  }
+
+  public void clear() {
+    inlinks.clear();
+  }
 
   public void readFields(DataInput in) throws IOException {
     int length = in.readInt();
@@ -67,30 +75,32 @@ public class Inlinks implements Writable
     return buffer.toString();
   }
 
-  /** Return the set of anchor texts.  Only a single anchor with a given text
-   * is permitted from a given domain. */
+  /**
+   * Return the set of anchor texts. Only a single anchor with a given text is
+   * permitted from a given domain.
+   */
   public String[] getAnchors() {
-    HashMap<String, Set<String>> domainToAnchors =
-      new HashMap<String, Set<String>>();
+    HashMap<String, Set<String>> domainToAnchors = new HashMap<String, Set<String>>();
     ArrayList<String> results = new ArrayList<String>();
     Iterator<Inlink> it = inlinks.iterator();
     while (it.hasNext()) {
       Inlink inlink = it.next();
       String anchor = inlink.getAnchor();
 
-      if (anchor.length() == 0)                   // skip empty anchors
+      if (anchor.length() == 0) // skip empty anchors
         continue;
-      String domain = null;                       // extract domain name
+      String domain = null; // extract domain name
       try {
         domain = new URL(inlink.getFromUrl()).getHost();
-      } catch (MalformedURLException e) {}
+      } catch (MalformedURLException e) {
+      }
       Set<String> domainAnchors = domainToAnchors.get(domain);
       if (domainAnchors == null) {
         domainAnchors = new HashSet<String>();
         domainToAnchors.put(domain, domainAnchors);
       }
-      if (domainAnchors.add(anchor)) {            // new anchor from domain
-        results.add(anchor);                      // collect it
+      if (domainAnchors.add(anchor)) { // new anchor from domain
+        results.add(anchor); // collect it
       }
     }
 

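The getAnchors() contract rewrapped above (empty anchors are skipped, and only one anchor with a given text is kept per source domain) can be exercised with a small sketch; the URLs below are made up.

    import org.apache.nutch.crawl.Inlink;
    import org.apache.nutch.crawl.Inlinks;

    public class AnchorsSketch {
      public static void main(String[] args) {
        Inlinks inlinks = new Inlinks();
        inlinks.add(new Inlink("http://a.example.org/page1", "Nutch"));
        inlinks.add(new Inlink("http://a.example.org/page2", "Nutch")); // same text, same domain: not collected
        inlinks.add(new Inlink("http://b.example.org/page1", "Nutch")); // same text, other domain: collected
        inlinks.add(new Inlink("http://a.example.org/page3", ""));      // empty anchor: skipped
        String[] anchors = inlinks.getAnchors();
        System.out.println(anchors.length + " anchors"); // expected: 2
      }
    }
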
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Jan 29 05:38:59 2015
@@ -43,7 +43,8 @@ import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.TimingUtil;
 
 /** Maintains an inverted link map, listing incoming links for each url. */
-public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData, Text, Inlinks> {
+public class LinkDb extends Configured implements Tool,
+    Mapper<Text, ParseData, Text, Inlinks> {
 
   public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
 
@@ -56,13 +57,14 @@ public class LinkDb extends Configured i
   private boolean ignoreInternalLinks;
   private URLFilters urlFilters;
   private URLNormalizers urlNormalizers;
-  
-  public LinkDb() {}
-  
+
+  public LinkDb() {
+  }
+
   public LinkDb(Configuration conf) {
     setConf(conf);
   }
-  
+
   public void configure(JobConf job) {
     maxAnchorLength = job.getInt("db.max.anchor.length", 100);
     ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
@@ -74,16 +76,19 @@ public class LinkDb extends Configured i
     }
   }
 
-  public void close() {}
+  public void close() {
+  }
 
   public void map(Text key, ParseData parseData,
-                  OutputCollector<Text, Inlinks> output, Reporter reporter)
-    throws IOException {
+      OutputCollector<Text, Inlinks> output, Reporter reporter)
+      throws IOException {
     String fromUrl = key.toString();
     String fromHost = getHost(fromUrl);
     if (urlNormalizers != null) {
       try {
-        fromUrl = urlNormalizers.normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url
+        fromUrl = urlNormalizers
+            .normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the
+                                                              // url
       } catch (Exception e) {
         LOG.warn("Skipping " + fromUrl + ":" + e);
         fromUrl = null;
@@ -97,7 +102,8 @@ public class LinkDb extends Configured i
         fromUrl = null;
       }
     }
-    if (fromUrl == null) return; // discard all outlinks
+    if (fromUrl == null)
+      return; // discard all outlinks
     Outlink[] outlinks = parseData.getOutlinks();
     Inlinks inlinks = new Inlinks();
     for (int i = 0; i < outlinks.length; i++) {
@@ -107,12 +113,14 @@ public class LinkDb extends Configured i
       if (ignoreInternalLinks) {
         String toHost = getHost(toUrl);
         if (toHost == null || toHost.equals(fromHost)) { // internal link
-          continue;                               // skip it
+          continue; // skip it
         }
       }
       if (urlNormalizers != null) {
         try {
-          toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url
+          toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize
+                                                                                // the
+                                                                                // url
         } catch (Exception e) {
           LOG.warn("Skipping " + toUrl + ":" + e);
           toUrl = null;
@@ -126,13 +134,14 @@ public class LinkDb extends Configured i
           toUrl = null;
         }
       }
-      if (toUrl == null) continue;
+      if (toUrl == null)
+        continue;
       inlinks.clear();
-      String anchor = outlink.getAnchor();        // truncate long anchors
+      String anchor = outlink.getAnchor(); // truncate long anchors
       if (anchor.length() > maxAnchorLength) {
         anchor = anchor.substring(0, maxAnchorLength);
       }
-      inlinks.add(new Inlink(fromUrl, anchor));   // collect inverted link
+      inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link
       output.collect(new Text(toUrl), inlinks);
     }
   }
@@ -145,13 +154,16 @@ public class LinkDb extends Configured i
     }
   }
 
-  public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException {
+  public void invert(Path linkDb, final Path segmentsDir, boolean normalize,
+      boolean filter, boolean force) throws IOException {
     final FileSystem fs = FileSystem.get(getConf());
-    FileStatus[] files = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+    FileStatus[] files = fs.listStatus(segmentsDir,
+        HadoopFSUtil.getPassDirectoriesFilter(fs));
     invert(linkDb, HadoopFSUtil.getPaths(files), normalize, filter, force);
   }
 
-  public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException {
+  public void invert(Path linkDb, Path[] segments, boolean normalize,
+      boolean filter, boolean force) throws IOException {
     JobConf job = LinkDb.createJob(getConf(), linkDb, normalize, filter);
     Path lock = new Path(linkDb, LOCK_NAME);
     FileSystem fs = FileSystem.get(getConf());
@@ -174,7 +186,8 @@ public class LinkDb extends Configured i
       if (LOG.isInfoEnabled()) {
         LOG.info("LinkDb: adding segment: " + segments[i]);
       }
-      FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME));
+      FileInputFormat.addInputPath(job, new Path(segments[i],
+          ParseData.DIR_NAME));
     }
     try {
       JobClient.runJob(job);
@@ -203,13 +216,14 @@ public class LinkDb extends Configured i
     LinkDb.install(job, linkDb);
 
     long end = System.currentTimeMillis();
-    LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+    LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
   }
 
-  private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
-    Path newLinkDb =
-      new Path("linkdb-" +
-               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+  private static JobConf createJob(Configuration config, Path linkDb,
+      boolean normalize, boolean filter) {
+    Path newLinkDb = new Path("linkdb-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
     JobConf job = new NutchJob(config);
     job.setJobName("linkdb " + linkDb);
@@ -247,12 +261,14 @@ public class LinkDb extends Configured i
     Path old = new Path(linkDb, "old");
     Path current = new Path(linkDb, CURRENT_NAME);
     if (fs.exists(current)) {
-      if (fs.exists(old)) fs.delete(old, true);
+      if (fs.exists(old))
+        fs.delete(old, true);
       fs.rename(current, old);
     }
     fs.mkdirs(linkDb);
     fs.rename(newLinkDb, current);
-    if (fs.exists(old)) fs.delete(old, true);
+    if (fs.exists(old))
+      fs.delete(old, true);
     LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME));
   }
 
@@ -263,11 +279,14 @@ public class LinkDb extends Configured i
 
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
+      System.err
+          .println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
       System.err.println("\tlinkdb\toutput LinkDb to create or update");
-      System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR");
+      System.err
+          .println("\t-dir segmentsDir\tparent directory of several segments, OR");
       System.err.println("\tseg1 seg2 ...\t list of segment directories");
-      System.err.println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
+      System.err
+          .println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
       System.err.println("\t-noNormalize\tdon't normalize link URLs");
       System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
       return -1;
@@ -281,7 +300,8 @@ public class LinkDb extends Configured i
     boolean force = false;
     for (int i = 1; i < args.length; i++) {
       if (args[i].equals("-dir")) {
-        FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
+        FileStatus[] paths = fs.listStatus(new Path(args[++i]),
+            HadoopFSUtil.getPassDirectoriesFilter(fs));
         segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
       } else if (args[i].equalsIgnoreCase("-noNormalize")) {
         normalize = false;
@@ -289,7 +309,8 @@ public class LinkDb extends Configured i
         filter = false;
       } else if (args[i].equalsIgnoreCase("-force")) {
         force = true;
-      } else segs.add(new Path(args[i]));
+      } else
+        segs.add(new Path(args[i]));
     }
     try {
       invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force);

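A hedged sketch of invoking the inverted-link build shown above through the invert(linkDb, segments, normalize, filter, force) overload; the linkdb path and segment name are placeholders.

    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.crawl.LinkDb;
    import org.apache.nutch.util.NutchConfiguration;

    public class LinkDbSketch {
      public static void main(String[] args) throws Exception {
        LinkDb linkDb = new LinkDb(NutchConfiguration.create());
        // Invert outlinks from one parsed segment into the linkdb,
        // normalizing and filtering link URLs, without forcing past a lock.
        linkDb.invert(new Path("crawl/linkdb"),
            new Path[] { new Path("crawl/segments/20150129053859") },
            true, true, false);
      }
    }
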
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java Thu Jan 29 05:38:59 2015
@@ -31,8 +31,8 @@ import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 
 /**
- * This class provides a way to separate the URL normalization
- * and filtering steps from the rest of LinkDb manipulation code.
+ * This class provides a way to separate the URL normalization and filtering
+ * steps from the rest of LinkDb manipulation code.
  * 
  * @author Andrzej Bialecki
  */
@@ -50,13 +50,13 @@ public class LinkDbFilter implements Map
   private URLFilters filters;
 
   private URLNormalizers normalizers;
-  
+
   private String scope;
-  
+
   public static final Logger LOG = LoggerFactory.getLogger(LinkDbFilter.class);
 
   private Text newKey = new Text();
-  
+
   public void configure(JobConf job) {
     filter = job.getBoolean(URL_FILTERING, false);
     normalize = job.getBoolean(URL_NORMALIZING, false);
@@ -69,10 +69,12 @@ public class LinkDbFilter implements Map
     }
   }
 
-  public void close() {}
+  public void close() {
+  }
 
   public void map(Text key, Inlinks value,
-      OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
+      OutputCollector<Text, Inlinks> output, Reporter reporter)
+      throws IOException {
     String url = key.toString();
     Inlinks result = new Inlinks();
     if (normalize) {
@@ -91,7 +93,8 @@ public class LinkDbFilter implements Map
         url = null;
       }
     }
-    if (url == null) return; // didn't pass the filters
+    if (url == null)
+      return; // didn't pass the filters
     Iterator<Inlink> it = value.iterator();
     String fromUrl = null;
     while (it.hasNext()) {
@@ -113,7 +116,7 @@ public class LinkDbFilter implements Map
           fromUrl = null;
         }
       }
-      if (fromUrl != null) { 
+      if (fromUrl != null) {
         result.add(new Inlink(fromUrl, inlink.getAnchor()));
       }
     }

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Thu Jan 29 05:38:59 2015
@@ -46,37 +46,44 @@ import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.TimingUtil;
 
 /**
- * This tool merges several LinkDb-s into one, optionally filtering
- * URLs through the current URLFilters, to skip prohibited URLs and
- * links.
+ * This tool merges several LinkDb-s into one, optionally filtering URLs through
+ * the current URLFilters, to skip prohibited URLs and links.
  * 
- * <p>It's possible to use this tool just for filtering - in that case
- * only one LinkDb should be specified in arguments.</p>
- * <p>If more than one LinkDb contains information about the same URL,
- * all inlinks are accumulated, but only at most <code>db.max.inlinks</code>
- * inlinks will ever be added.</p>
- * <p>If activated, URLFilters will be applied to both the target URLs and
- * to any incoming link URL. If a target URL is prohibited, all
- * inlinks to that target will be removed, including the target URL. If
- * some of incoming links are prohibited, only they will be removed, and they
- * won't count when checking the above-mentioned maximum limit.
+ * <p>
+ * It's possible to use this tool just for filtering - in that case only one
+ * LinkDb should be specified in arguments.
+ * </p>
+ * <p>
+ * If more than one LinkDb contains information about the same URL, all inlinks
+ * are accumulated, but only at most <code>db.max.inlinks</code> inlinks will
+ * ever be added.
+ * </p>
+ * <p>
+ * If activated, URLFilters will be applied to both the target URLs and to any
+ * incoming link URL. If a target URL is prohibited, all inlinks to that target
+ * will be removed, including the target URL. If some of incoming links are
+ * prohibited, only they will be removed, and they won't count when checking the
+ * above-mentioned maximum limit.
  * 
  * @author Andrzej Bialecki
  */
-public class LinkDbMerger extends Configured implements Tool, Reducer<Text, Inlinks, Text, Inlinks> {
+public class LinkDbMerger extends Configured implements Tool,
+    Reducer<Text, Inlinks, Text, Inlinks> {
   private static final Logger LOG = LoggerFactory.getLogger(LinkDbMerger.class);
-  
+
   private int maxInlinks;
-  
+
   public LinkDbMerger() {
-    
+
   }
-  
+
   public LinkDbMerger(Configuration conf) {
     setConf(conf);
   }
 
-  public void reduce(Text key, Iterator<Inlinks> values, OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
+  public void reduce(Text key, Iterator<Inlinks> values,
+      OutputCollector<Text, Inlinks> output, Reporter reporter)
+      throws IOException {
 
     Inlinks result = new Inlinks();
 
@@ -86,43 +93,48 @@ public class LinkDbMerger extends Config
       int end = Math.min(maxInlinks - result.size(), inlinks.size());
       Iterator<Inlink> it = inlinks.iterator();
       int i = 0;
-      while(it.hasNext() && i++ < end) {
+      while (it.hasNext() && i++ < end) {
         result.add(it.next());
       }
     }
-    if (result.size() == 0) return;
+    if (result.size() == 0)
+      return;
     output.collect(key, result);
-    
+
   }
 
   public void configure(JobConf job) {
     maxInlinks = job.getInt("db.max.inlinks", 10000);
   }
 
-  public void close() throws IOException { }
+  public void close() throws IOException {
+  }
 
-  public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
+  public void merge(Path output, Path[] dbs, boolean normalize, boolean filter)
+      throws Exception {
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     LOG.info("LinkDb merge: starting at " + sdf.format(start));
 
     JobConf job = createMergeJob(getConf(), output, normalize, filter);
     for (int i = 0; i < dbs.length; i++) {
-      FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));      
+      FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME));
     }
     JobClient.runJob(job);
     FileSystem fs = FileSystem.get(getConf());
     fs.mkdirs(output);
-    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME));
+    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output,
+        LinkDb.CURRENT_NAME));
 
     long end = System.currentTimeMillis();
-    LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+    LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
   }
 
-  public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
-    Path newLinkDb =
-      new Path("linkdb-merge-" + 
-               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+  public static JobConf createMergeJob(Configuration config, Path linkDb,
+      boolean normalize, boolean filter) {
+    Path newLinkDb = new Path("linkdb-merge-"
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
     JobConf job = new NutchJob(config);
     job.setJobName("linkdb merge " + linkDb);
@@ -145,22 +157,27 @@ public class LinkDbMerger extends Config
 
     return job;
   }
-  
+
   /**
    * @param args
    */
   public static void main(String[] args) throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(),
+        args);
     System.exit(res);
   }
-  
+
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err.println("Usage: LinkDbMerger <output_linkdb> <linkdb1> [<linkdb2> <linkdb3> ...] [-normalize] [-filter]");
+      System.err
+          .println("Usage: LinkDbMerger <output_linkdb> <linkdb1> [<linkdb2> <linkdb3> ...] [-normalize] [-filter]");
       System.err.println("\toutput_linkdb\toutput LinkDb");
-      System.err.println("\tlinkdb1 ...\tinput LinkDb-s (single input LinkDb is ok)");
-      System.err.println("\t-normalize\tuse URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)");
-      System.err.println("\t-filter\tuse URLFilters on both fromUrls and toUrls in linkdb(s)");
+      System.err
+          .println("\tlinkdb1 ...\tinput LinkDb-s (single input LinkDb is ok)");
+      System.err
+          .println("\t-normalize\tuse URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)");
+      System.err
+          .println("\t-filter\tuse URLFilters on both fromUrls and toUrls in linkdb(s)");
       return -1;
     }
     Path output = new Path(args[0]);
@@ -172,7 +189,8 @@ public class LinkDbMerger extends Config
         filter = true;
       } else if (args[i].equals("-normalize")) {
         normalize = true;
-      } else dbs.add(new Path(args[i]));
+      } else
+        dbs.add(new Path(args[i]));
     }
     try {
       merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter);

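The merge/filter behavior documented in the class Javadoc above can be driven as in this sketch, using the merge(output, dbs, normalize, filter) method from the diff; the linkdb paths are placeholders.

    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.crawl.LinkDbMerger;
    import org.apache.nutch.util.NutchConfiguration;

    public class LinkDbMergerSketch {
      public static void main(String[] args) throws Exception {
        LinkDbMerger merger = new LinkDbMerger(NutchConfiguration.create());
        // Merge two linkdbs into one, applying URLFilters but no normalization;
        // at most db.max.inlinks (default 10000) inlinks are kept per target URL.
        merger.merge(new Path("crawl/linkdb-merged"),
            new Path[] { new Path("crawl/linkdb1"), new Path("crawl/linkdb2") },
            false, true);
      }
    }
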
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Thu Jan 29 05:38:59 2015
@@ -50,14 +50,14 @@ public class LinkDbReader extends Config
   private MapFile.Reader[] readers;
 
   public LinkDbReader() {
-    
+
   }
-  
+
   public LinkDbReader(Configuration conf, Path directory) throws Exception {
     setConf(conf);
     init(directory);
   }
-  
+
   public void init(Path directory) throws Exception {
     this.fs = FileSystem.get(getConf());
     this.directory = directory;
@@ -73,16 +73,16 @@ public class LinkDbReader extends Config
   public Inlinks getInlinks(Text url) throws IOException {
 
     if (readers == null) {
-      synchronized(this) {
-        readers = MapFileOutputFormat.getReaders
-          (fs, new Path(directory, LinkDb.CURRENT_NAME), getConf());
+      synchronized (this) {
+        readers = MapFileOutputFormat.getReaders(fs, new Path(directory,
+            LinkDb.CURRENT_NAME), getConf());
       }
     }
-    
-    return (Inlinks)MapFileOutputFormat.getEntry
-      (readers, PARTITIONER, url, new Inlinks());
+
+    return (Inlinks) MapFileOutputFormat.getEntry(readers, PARTITIONER, url,
+        new Inlinks());
   }
-  
+
   public void close() throws IOException {
     if (readers != null) {
       for (int i = 0; i < readers.length; i++) {
@@ -90,7 +90,7 @@ public class LinkDbReader extends Config
       }
     }
   }
-  
+
   public void processDumpJob(String linkdb, String output) throws IOException {
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
@@ -114,19 +114,24 @@ public class LinkDbReader extends Config
     JobClient.runJob(job);
 
     long end = System.currentTimeMillis();
-    LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+    LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
   }
-  
+
   public static void main(String[] args) throws Exception {
-    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(),
+        args);
     System.exit(res);
   }
-  
+
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err.println("Usage: LinkDbReader <linkdb> (-dump <out_dir> | -url <url>)");
-      System.err.println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>");
-      System.err.println("\t-url <url>\tprint information about <url> to System.out");
+      System.err
+          .println("Usage: LinkDbReader <linkdb> (-dump <out_dir> | -url <url>)");
+      System.err
+          .println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>");
+      System.err
+          .println("\t-url <url>\tprint information about <url> to System.out");
       return -1;
     }
     try {

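A minimal sketch of random-access lookups with the reader reformatted above, via getInlinks(Text); the linkdb path and URL are illustrative.

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.nutch.crawl.Inlinks;
    import org.apache.nutch.crawl.LinkDbReader;
    import org.apache.nutch.util.NutchConfiguration;

    public class LinkDbReaderSketch {
      public static void main(String[] args) throws Exception {
        LinkDbReader reader = new LinkDbReader(NutchConfiguration.create(),
            new Path("crawl/linkdb"));
        // Look up all recorded inlinks for a single URL.
        Inlinks inlinks = reader.getInlinks(new Text("http://example.org/"));
        System.out.println(inlinks == null ? "no inlinks" : inlinks.toString());
        reader.close();
      }
    }
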
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java Thu Jan 29 05:38:59 2015
@@ -22,9 +22,9 @@ import org.apache.nutch.parse.Parse;
 import org.apache.nutch.protocol.Content;
 
 /**
- * Default implementation of a page signature. It calculates an MD5 hash
- * of the raw binary content of a page. In case there is no content, it
- * calculates a hash from the page's URL.
+ * Default implementation of a page signature. It calculates an MD5 hash of the
+ * raw binary content of a page. In case there is no content, it calculates a
+ * hash from the page's URL.
  * 
  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
  */
@@ -32,7 +32,8 @@ public class MD5Signature extends Signat
 
   public byte[] calculate(Content content, Parse parse) {
     byte[] data = content.getContent();
-    if (data == null) data = content.getUrl().getBytes();
+    if (data == null)
+      data = content.getUrl().getBytes();
     return MD5Hash.digest(data).getDigest();
   }
 }

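The content-or-URL fallback described in the MD5Signature Javadoc reduces to the two lines below; this sketch restates it with MD5Hash directly, outside the Content/Parse plumbing, on made-up input.

    import org.apache.hadoop.io.MD5Hash;

    public class SignatureSketch {
      public static void main(String[] args) {
        byte[] pageContent = null; // e.g. a fetch that produced no content
        String url = "http://example.org/";
        // Mirror MD5Signature.calculate(): hash the raw content if present,
        // otherwise fall back to hashing the page's URL bytes.
        byte[] data = (pageContent != null) ? pageContent : url.getBytes();
        byte[] signature = MD5Hash.digest(data).getDigest();
        System.out.println("signature is " + signature.length + " bytes");
      }
    }
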
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Thu Jan 29 05:38:59 2015
@@ -47,19 +47,19 @@ import org.apache.hadoop.util.StringUtil
 import org.apache.nutch.protocol.ProtocolStatus;
 
 /**
- * A writable map, with a similar behavior as <code>java.util.HashMap</code>.
- * In addition to the size of key and value writable tuple two additional bytes
- * are stored to identify the Writable classes. This means that a maximum of
- * 255 different class types can be used for key and value objects.
- * A binary-id to class mapping is defined in a static block of this class.
- * However it is possible to use custom implementations of Writable.
- * For these custom Writables we write the byte id - utf class name tuple
- * into the header of each MapWritable that uses these types.
- *
+ * A writable map, with a similar behavior as <code>java.util.HashMap</code>. In
+ * addition to the size of key and value writable tuple two additional bytes are
+ * stored to identify the Writable classes. This means that a maximum of 255
+ * different class types can be used for key and value objects. A binary-id to
+ * class mapping is defined in a static block of this class. However it is
+ * possible to use custom implementations of Writable. For these custom
+ * Writables we write the byte id - utf class name tuple into the header of each
+ * MapWritable that uses these types.
+ * 
  * @author Stefan Groschupf
  * @deprecated Use org.apache.hadoop.io.MapWritable instead.
  */
- 
+
 @Deprecated
 public class MapWritable implements Writable {
 
@@ -105,14 +105,16 @@ public class MapWritable implements Writ
     CLASS_ID_MAP.put(clazz, byteId);
     ID_CLASS_MAP.put(byteId, clazz);
   }
-  
-  public MapWritable() { }
-  
+
+  public MapWritable() {
+  }
+
   /**
    * Copy constructor. This constructor makes a deep copy, using serialization /
    * deserialization to break any possible references to contained objects.
    * 
-   * @param map map to copy from
+   * @param map
+   *          map to copy from
    */
   public MapWritable(MapWritable map) {
     if (map != null) {
@@ -123,8 +125,8 @@ public class MapWritable implements Writ
         dib.reset(dob.getData(), dob.getLength());
         readFields(dib);
       } catch (IOException e) {
-        throw new IllegalArgumentException("this map cannot be copied: " +
-                StringUtils.stringifyException(e));
+        throw new IllegalArgumentException("this map cannot be copied: "
+            + StringUtils.stringifyException(e));
       }
     }
   }
@@ -177,7 +179,8 @@ public class MapWritable implements Writ
 
   public Set<Writable> keySet() {
     HashSet<Writable> set = new HashSet<Writable>();
-    if (isEmpty()) return set;
+    if (isEmpty())
+      return set;
     set.add(fFirst.fKey);
     KeyValueEntry entry = fFirst;
     while ((entry = entry.fNextEntry) != null) {
@@ -257,7 +260,8 @@ public class MapWritable implements Writ
   public boolean equals(Object obj) {
     if (obj instanceof MapWritable) {
       MapWritable map = (MapWritable) obj;
-      if (fSize != map.fSize) return false;
+      if (fSize != map.fSize)
+        return false;
       HashSet<KeyValueEntry> set1 = new HashSet<KeyValueEntry>();
       KeyValueEntry e1 = fFirst;
       while (e1 != null) {
@@ -345,7 +349,7 @@ public class MapWritable implements Writ
           clazz = Class.forName(Text.readString(in));
           addIdEntry(id, clazz);
         } catch (Exception e) {
-          if (LOG.isWarnEnabled()) { 
+          if (LOG.isWarnEnabled()) {
             LOG.warn("Unable to load internal map entry" + e.toString());
           }
           fIdCount--;
@@ -364,8 +368,8 @@ public class MapWritable implements Writ
           }
         } catch (IOException e) {
           if (LOG.isWarnEnabled()) {
-            LOG.warn("Unable to load meta data entry, ignoring.. : "  +
-                     e.toString());
+            LOG.warn("Unable to load meta data entry, ignoring.. : "
+                + e.toString());
           }
           fSize--;
         }

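The Javadoc above deprecates this class in favor of org.apache.hadoop.io.MapWritable; a hedged sketch of a simple put/get round trip with that replacement (the same class used further down in MimeAdaptiveFetchSchedule), with an arbitrary metadata key.

    import org.apache.hadoop.io.MapWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;

    public class MapWritableSketch {
      public static void main(String[] args) {
        MapWritable meta = new MapWritable();
        // Hadoop's MapWritable implements java.util.Map<Writable, Writable>.
        meta.put(new Text("Content-Type"), new Text("text/html; charset=utf-8"));
        Writable value = meta.get(new Text("Content-Type"));
        System.out.println(value); // text/html; charset=utf-8
      }
    }
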
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java Thu Jan 29 05:38:59 2015
@@ -34,29 +34,31 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Extension of @see AdaptiveFetchSchedule that allows for more flexible configuration
- * of DEC and INC factors for various MIME-types.
- *
- * This class can be typically used in cases where a recrawl consists of many different
- * MIME-types. It's not very common for MIME-types other than text/html to change frequently.
- * Using this class you can configure different factors per MIME-type so to prefer frequently
- * changing MIME-types over others.
+ * Extension of @see AdaptiveFetchSchedule that allows for more flexible
+ * configuration of DEC and INC factors for various MIME-types.
+ * 
+ * This class can be typically used in cases where a recrawl consists of many
+ * different MIME-types. It's not very common for MIME-types other than
+ * text/html to change frequently. Using this class you can configure different
+ * factors per MIME-type so to prefer frequently changing MIME-types over
+ * others.
+ * 
+ * For it to work this class relies on the Content-Type MetaData key being
+ * present in the CrawlDB. This can either be done when injecting new URL's or
+ * by adding "Content-Type" to the db.parsemeta.to.crawldb configuration setting
+ * to force MIME-types of newly discovered URL's to be added to the CrawlDB.
  * 
- * For it to work this class relies on the Content-Type MetaData key being present in the CrawlDB.
- * This can either be done when injecting new URL's or by adding "Content-Type" to the
- * db.parsemeta.to.crawldb configuration setting to force MIME-types of newly discovered URL's to
- * be added to the CrawlDB.
- *
  * @author markus
  */
 public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule {
   // Loggg
-  public static final Logger LOG = LoggerFactory.getLogger(MimeAdaptiveFetchSchedule.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(MimeAdaptiveFetchSchedule.class);
 
   // Conf directives
   public static final String SCHEDULE_INC_RATE = "db.fetch.schedule.adaptive.inc_rate";
   public static final String SCHEDULE_DEC_RATE = "db.fetch.schedule.adaptive.dec_rate";
-  public static final String SCHEDULE_MIME_FILE= "db.fetch.schedule.mime.file";
+  public static final String SCHEDULE_MIME_FILE = "db.fetch.schedule.mime.file";
 
   // Default values for DEC and INC rate
   private float defaultIncRate;
@@ -74,18 +76,21 @@ public class MimeAdaptiveFetchSchedule e
   }
 
   // Here we store the mime's and their delta's
-  private HashMap<String,AdaptiveRate> mimeMap;
+  private HashMap<String, AdaptiveRate> mimeMap;
 
   public void setConf(Configuration conf) {
     super.setConf(conf);
-    if (conf == null) return;
+    if (conf == null)
+      return;
 
-    // Read and set the default INC and DEC rates in case we cannot set values based on MIME-type
+    // Read and set the default INC and DEC rates in case we cannot set values
+    // based on MIME-type
     defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
     defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);
 
     // Where's the mime/factor file?
-    Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, "adaptive-mimetypes.txt"));
+    Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE,
+        "adaptive-mimetypes.txt"));
 
     try {
       readMimeFile(mimeFile);
@@ -96,8 +101,8 @@ public class MimeAdaptiveFetchSchedule e
 
   @Override
   public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
-          long prevFetchTime, long prevModifiedTime,
-          long fetchTime, long modifiedTime, int state) {
+      long prevFetchTime, long prevModifiedTime, long fetchTime,
+      long modifiedTime, int state) {
 
     // Set defaults
     INC_RATE = defaultIncRate;
@@ -106,7 +111,8 @@ public class MimeAdaptiveFetchSchedule e
     // Check if the Content-Type field is available in the CrawlDatum
     if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) {
       // Get the MIME-type of the current URL
-      String currentMime = datum.getMetaData().get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString();
+      String currentMime = datum.getMetaData()
+          .get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString();
 
       // Get rid of charset
       currentMime = currentMime.substring(0, currentMime.indexOf(';'));
@@ -120,18 +126,19 @@ public class MimeAdaptiveFetchSchedule e
     }
 
     return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
-      fetchTime, modifiedTime, state);
+        fetchTime, modifiedTime, state);
   }
 
   /**
    * Reads the mime types and their associated INC/DEC factors in a HashMap
-   *
-   * @param mimeFile Reader
+   * 
+   * @param mimeFile
+   *          Reader
    * @throws IOException if the MIME/factor file cannot be read
    */
   private void readMimeFile(Reader mimeFile) throws IOException {
     // Instance of our mime/factor map
-    mimeMap = new HashMap<String,AdaptiveRate>();
+    mimeMap = new HashMap<String, AdaptiveRate>();
 
     // Open a reader
     BufferedReader reader = new BufferedReader(mimeFile);
@@ -149,7 +156,8 @@ public class MimeAdaptiveFetchSchedule e
         // Sanity check, we need two or three items
         if (splits.length == 3) {
           // Add a lower cased MIME-type and the factor to the map
-          mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(new Float(splits[1]), new Float(splits[2])));
+          mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(
+              new Float(splits[1]), new Float(splits[2])));
         } else {
           LOG.warn("Invalid configuration line in: " + line);
         }
@@ -178,7 +186,8 @@ public class MimeAdaptiveFetchSchedule e
 
     // Set a default MIME-type to test with
     org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
-    x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
+    x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text(
+        "text/html; charset=utf-8"));
     p.setMetaData(x);
 
     p.setFetchTime(0);
@@ -187,37 +196,45 @@ public class MimeAdaptiveFetchSchedule e
     // let's move the timeline a couple of deltas
     for (int i = 0; i < 10000; i++) {
       if (lastModified + update < curTime) {
-        //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
+        // System.out.println("i=" + i + ", lastModified=" + lastModified +
+        // ", update=" + update + ", curTime=" + curTime);
         changed = true;
         changeCnt++;
         lastModified = curTime;
       }
 
-      LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
-              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
+      LOG.info(i + ". " + changed + "\twill fetch at "
+          + (p.getFetchTime() / delta) + "\tinterval "
+          + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+          + miss);
 
       if (p.getFetchTime() <= curTime) {
         fetchCnt++;
-        fs.setFetchSchedule(new Text("http://www.example.com"), p,
-                p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
-                changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
-
-        LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
-                + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
-
-        if (!changed) miss++;
-        if (miss > maxMiss) maxMiss = miss;
+        fs.setFetchSchedule(new Text("http://www.example.com"), p, p
+            .getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+            changed ? FetchSchedule.STATUS_MODIFIED
+                : FetchSchedule.STATUS_NOTMODIFIED);
+
+        LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+            + (p.getFetchTime() / delta) + "\tinterval "
+            + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
+
+        if (!changed)
+          miss++;
+        if (miss > maxMiss)
+          maxMiss = miss;
         changed = false;
         totalMiss += miss;
         miss = 0;
       }
 
-      if (changed) miss++;
+      if (changed)
+        miss++;
       curTime += delta;
     }
     LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
-    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
+    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+        + " times.");
   }
 
-
 }
\ No newline at end of file
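
For reference, readMimeFile() above expects one entry per line in the file
named by db.fetch.schedule.mime.file (default adaptive-mimetypes.txt): a
lower-cased MIME-type followed by its INC and DEC factors, which override the
defaults read just above. A hypothetical file could look like the sketch
below; the exact delimiter and comment syntax are not visible in this hunk
and are assumptions here, as are the factor values.

    # MIME-type         INC     DEC   (illustrative values only)
    text/html           0.2     0.2
    application/pdf     0.1     0.4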

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Thu Jan 29 05:38:59 2015
@@ -26,32 +26,31 @@ public class NutchWritable extends Gener
 
   static {
     CLASSES = (Class<? extends Writable>[]) new Class<?>[] {
-      org.apache.hadoop.io.NullWritable.class,
-      org.apache.hadoop.io.BooleanWritable.class,
-      org.apache.hadoop.io.LongWritable.class,
-      org.apache.hadoop.io.BytesWritable.class,
-      org.apache.hadoop.io.FloatWritable.class,
-      org.apache.hadoop.io.IntWritable.class,
-      org.apache.hadoop.io.MapWritable.class,
-      org.apache.hadoop.io.Text.class,
-      org.apache.hadoop.io.MD5Hash.class,
-      org.apache.nutch.crawl.CrawlDatum.class,
-      org.apache.nutch.crawl.Inlink.class,
-      org.apache.nutch.crawl.Inlinks.class,
-      org.apache.nutch.indexer.NutchIndexAction.class,
-      org.apache.nutch.metadata.Metadata.class,
-      org.apache.nutch.parse.Outlink.class,
-      org.apache.nutch.parse.ParseText.class,
-      org.apache.nutch.parse.ParseData.class,
-      org.apache.nutch.parse.ParseImpl.class,
-      org.apache.nutch.parse.ParseStatus.class,
-      org.apache.nutch.protocol.Content.class,
-      org.apache.nutch.protocol.ProtocolStatus.class,
-      org.apache.nutch.scoring.webgraph.LinkDatum.class
-    };
+        org.apache.hadoop.io.NullWritable.class,
+        org.apache.hadoop.io.BooleanWritable.class,
+        org.apache.hadoop.io.LongWritable.class,
+        org.apache.hadoop.io.BytesWritable.class,
+        org.apache.hadoop.io.FloatWritable.class,
+        org.apache.hadoop.io.IntWritable.class,
+        org.apache.hadoop.io.MapWritable.class,
+        org.apache.hadoop.io.Text.class, org.apache.hadoop.io.MD5Hash.class,
+        org.apache.nutch.crawl.CrawlDatum.class,
+        org.apache.nutch.crawl.Inlink.class,
+        org.apache.nutch.crawl.Inlinks.class,
+        org.apache.nutch.indexer.NutchIndexAction.class,
+        org.apache.nutch.metadata.Metadata.class,
+        org.apache.nutch.parse.Outlink.class,
+        org.apache.nutch.parse.ParseText.class,
+        org.apache.nutch.parse.ParseData.class,
+        org.apache.nutch.parse.ParseImpl.class,
+        org.apache.nutch.parse.ParseStatus.class,
+        org.apache.nutch.protocol.Content.class,
+        org.apache.nutch.protocol.ProtocolStatus.class,
+        org.apache.nutch.scoring.webgraph.LinkDatum.class };
   }
 
-  public NutchWritable() { }
+  public NutchWritable() {
+  }
 
   public NutchWritable(Writable instance) {
     set(instance);
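
NutchWritable is a GenericWritable: the CLASSES array above is the registry
of concrete Writable types it can wrap, and since GenericWritable serializes
the index of the wrapped type, the registration order effectively becomes
part of the serialized format. A minimal usage sketch, not part of this
commit (the class name NutchWritableExample is only for illustration):

    import org.apache.hadoop.io.Writable;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.crawl.NutchWritable;

    public class NutchWritableExample {
      public static void main(String[] args) {
        // Wrap one of the registered types; get() hands back the concrete instance.
        NutchWritable wrapped = new NutchWritable(new CrawlDatum());
        Writable inner = wrapped.get();
        System.out.println(inner.getClass().getName()); // org.apache.nutch.crawl.CrawlDatum
      }
    }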

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java Thu Jan 29 05:38:59 2015
@@ -24,7 +24,7 @@ import org.apache.hadoop.conf.Configurab
 
 public abstract class Signature implements Configurable {
   protected Configuration conf;
-  
+
   public abstract byte[] calculate(Content content, Parse parse);
 
   public Configuration getConf() {

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java Thu Jan 29 05:38:59 2015
@@ -23,25 +23,34 @@ public class SignatureComparator impleme
   public int compare(Object o1, Object o2) {
     return _compare(o1, o2);
   }
-  
+
   public static int _compare(Object o1, Object o2) {
-    if (o1 == null && o2 == null) return 0;
-    if (o1 == null) return -1;
-    if (o2 == null) return 1;
-    if (!(o1 instanceof byte[])) return -1;
-    if (!(o2 instanceof byte[])) return 1;
-    byte[] data1 = (byte[])o1;
-    byte[] data2 = (byte[])o2;
+    if (o1 == null && o2 == null)
+      return 0;
+    if (o1 == null)
+      return -1;
+    if (o2 == null)
+      return 1;
+    if (!(o1 instanceof byte[]))
+      return -1;
+    if (!(o2 instanceof byte[]))
+      return 1;
+    byte[] data1 = (byte[]) o1;
+    byte[] data2 = (byte[]) o2;
     return _compare(data1, 0, data1.length, data2, 0, data2.length);
   }
-  
-  public static int _compare(byte[] data1, int s1, int l1, byte[] data2, int s2, int l2) {
-    if (l2 > l1) return -1;
-    if (l2 < l1) return 1;
+
+  public static int _compare(byte[] data1, int s1, int l1, byte[] data2,
+      int s2, int l2) {
+    if (l2 > l1)
+      return -1;
+    if (l2 < l1)
+      return 1;
     int res = 0;
     for (int i = 0; i < l1; i++) {
       res = (data1[s1 + i] - data2[s2 + i]);
-      if (res != 0) return res;
+      if (res != 0)
+        return res;
     }
     return 0;
   }
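
As a quick standalone illustration of the ordering implemented above (not
part of this commit; the class name is hypothetical): null sorts before
non-null, shorter signatures sort before longer ones, and equal-length
signatures are compared byte by byte.

    import org.apache.nutch.crawl.SignatureComparator;

    public class SignatureCompareExample {
      public static void main(String[] args) {
        byte[] a = new byte[] { 0x01, 0x02 };
        byte[] b = new byte[] { 0x01, 0x03 };
        byte[] c = new byte[] { 0x01, 0x02, 0x03 };

        System.out.println(SignatureComparator._compare(a, b));    // negative: 0x02 < 0x03
        System.out.println(SignatureComparator._compare(a, c));    // -1: a is shorter than c
        System.out.println(SignatureComparator._compare(null, a)); // -1: null sorts first
      }
    }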

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java Thu Jan 29 05:38:59 2015
@@ -27,28 +27,30 @@ import org.apache.nutch.util.ObjectCache
 
 /**
  * Factory class, which instantiates a Signature implementation according to the
- * current Configuration configuration. This newly created instance is cached in the
- * Configuration instance, so that it could be later retrieved.
+ * current Configuration. This newly created instance is cached in
+ * the Configuration instance, so that it can be retrieved later.
  * 
  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
  */
 public class SignatureFactory {
-  private static final Logger LOG = LoggerFactory.getLogger(SignatureFactory.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SignatureFactory.class);
 
-  private SignatureFactory() {}                   // no public ctor
+  private SignatureFactory() {
+  } // no public ctor
 
   /** Return the default Signature implementation. */
   public synchronized static Signature getSignature(Configuration conf) {
     String clazz = conf.get("db.signature.class", MD5Signature.class.getName());
     ObjectCache objectCache = ObjectCache.get(conf);
-    Signature impl = (Signature)objectCache.getObject(clazz);
+    Signature impl = (Signature) objectCache.getObject(clazz);
     if (impl == null) {
       try {
         if (LOG.isInfoEnabled()) {
           LOG.info("Using Signature impl: " + clazz);
         }
         Class<?> implClass = Class.forName(clazz);
-        impl = (Signature)implClass.newInstance();
+        impl = (Signature) implClass.newInstance();
         impl.setConf(conf);
         objectCache.setObject(clazz, impl);
       } catch (Exception e) {
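
The factory above reads db.signature.class (falling back to MD5Signature),
instantiates the class reflectively, and caches the instance in the
ObjectCache tied to the Configuration. A minimal usage sketch, not part of
this commit (the class name is illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.crawl.Signature;
    import org.apache.nutch.crawl.SignatureFactory;
    import org.apache.nutch.util.NutchConfiguration;

    public class SignatureFactoryExample {
      public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        // Pick an implementation; leaving this unset falls back to MD5Signature.
        conf.set("db.signature.class", "org.apache.nutch.crawl.TextProfileSignature");
        Signature sig = SignatureFactory.getSignature(conf);
        System.out.println(sig.getClass().getName());
      }
    }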

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java Thu Jan 29 05:38:59 2015
@@ -22,9 +22,9 @@ import org.apache.nutch.parse.Parse;
 import org.apache.nutch.protocol.Content;
 
 /**
- * Implementation of a page signature. It calculates an MD5 hash
- * of the textual content of a page. In case there is no content, it
- * calculates a hash from the page's URL.
+ * Implementation of a page signature. It calculates an MD5 hash of the textual
+ * content of a page. In case there is no content, it calculates a hash from the
+ * page's URL.
  */
 public class TextMD5Signature extends Signature {
 
@@ -36,7 +36,7 @@ public class TextMD5Signature extends Si
     if (text == null || text.length() == 0) {
       return fallback.calculate(content, parse);
     }
-    
+
     return MD5Hash.digest(text).getDigest();
   }
 }
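
The signature above is simply an MD5 digest of the parse text, with
MD5Signature as the fallback when no text is available. The same primitive,
shown stand-alone (a sketch only; the class name is hypothetical):

    import org.apache.hadoop.io.MD5Hash;

    public class TextMD5Example {
      public static void main(String[] args) {
        MD5Hash hash = MD5Hash.digest("Some extracted page text");
        System.out.println(hash);            // hex form of the digest
        byte[] signature = hash.getDigest(); // the raw 16-byte signature
        System.out.println(signature.length);
      }
    }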

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java Thu Jan 29 05:38:59 2015
@@ -35,41 +35,50 @@ import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
 /**
- * <p>An implementation of a page signature. It calculates an MD5 hash
- * of a plain text "profile" of a page. In case there is no text, it
- * calculates a hash using the {@link MD5Signature}.</p>
- * <p>The algorithm to calculate a page "profile" takes the plain text version of
- * a page and performs the following steps:
+ * <p>
+ * An implementation of a page signature. It calculates an MD5 hash of a plain
+ * text "profile" of a page. In case there is no text, it calculates a hash
+ * using the {@link MD5Signature}.
+ * </p>
+ * <p>
+ * The algorithm to calculate a page "profile" takes the plain text version of a
+ * page and performs the following steps:
  * <ul>
  * <li>remove all characters except letters and digits, and bring all characters
  * to lower case,</li>
  * <li>split the text into tokens (all consecutive non-whitespace characters),</li>
- * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2 characters),</li>
+ * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2
+ * characters),</li>
  * <li>sort the list of tokens by decreasing frequency,</li>
- * <li>round down the counts of tokens to the nearest multiple of QUANT
- * (<code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is 0.01f
- * by default, and <code>maxFreq</code> is the maximum token frequency). If
- * <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2 (which
- * means that tokens with frequency 1 are always discarded).</li>
- * <li>tokens, which frequency after quantization falls below QUANT, are discarded.</li>
- * <li>create a list of tokens and their quantized frequency, separated by spaces,
- * in the order of decreasing frequency.</li>
+ * <li>round down the counts of tokens to the nearest multiple of QUANT (
+ * <code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is
+ * 0.01f by default, and <code>maxFreq</code> is the maximum token frequency).
+ * If <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2
+ * (which means that tokens with frequency 1 are always discarded).</li>
+ * <li>tokens whose frequency after quantization falls below QUANT are
+ * discarded.</li>
+ * <li>create a list of tokens and their quantized frequency, separated by
+ * spaces, in the order of decreasing frequency.</li>
  * </ul>
  * This list is then submitted to an MD5 hash calculation.
  * 
  * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
  */
 public class TextProfileSignature extends Signature {
-  
+
   Signature fallback = new MD5Signature();
 
   public byte[] calculate(Content content, Parse parse) {
-    int MIN_TOKEN_LEN = getConf().getInt("db.signature.text_profile.min_token_len", 2);
-    float QUANT_RATE = getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
+    int MIN_TOKEN_LEN = getConf().getInt(
+        "db.signature.text_profile.min_token_len", 2);
+    float QUANT_RATE = getConf().getFloat(
+        "db.signature.text_profile.quant_rate", 0.01f);
     HashMap<String, Token> tokens = new HashMap<String, Token>();
     String text = null;
-    if (parse != null) text = parse.getText();
-    if (text == null || text.length() == 0) return fallback.calculate(content, parse);
+    if (parse != null)
+      text = parse.getText();
+    if (text == null || text.length() == 0)
+      return fallback.calculate(content, parse);
     StringBuffer curToken = new StringBuffer();
     int maxFreq = 0;
     for (int i = 0; i < text.length(); i++) {
@@ -87,7 +96,8 @@ public class TextProfileSignature extend
               tokens.put(s, tok);
             }
             tok.cnt++;
-            if (tok.cnt > maxFreq) maxFreq = tok.cnt;
+            if (tok.cnt > maxFreq)
+              maxFreq = tok.cnt;
           }
           curToken.setLength(0);
         }
@@ -103,17 +113,20 @@ public class TextProfileSignature extend
         tokens.put(s, tok);
       }
       tok.cnt++;
-      if (tok.cnt > maxFreq) maxFreq = tok.cnt;
+      if (tok.cnt > maxFreq)
+        maxFreq = tok.cnt;
     }
     Iterator<Token> it = tokens.values().iterator();
     ArrayList<Token> profile = new ArrayList<Token>();
     // calculate the QUANT value
     int QUANT = Math.round(maxFreq * QUANT_RATE);
     if (QUANT < 2) {
-      if (maxFreq > 1) QUANT = 2;
-      else QUANT = 1;
+      if (maxFreq > 1)
+        QUANT = 2;
+      else
+        QUANT = 1;
     }
-    while(it.hasNext()) {
+    while (it.hasNext()) {
       Token t = it.next();
       // round down to the nearest QUANT
       t.cnt = (t.cnt / QUANT) * QUANT;
@@ -128,32 +141,33 @@ public class TextProfileSignature extend
     it = profile.iterator();
     while (it.hasNext()) {
       Token t = it.next();
-      if (newText.length() > 0) newText.append("\n");
+      if (newText.length() > 0)
+        newText.append("\n");
       newText.append(t.toString());
     }
     return MD5Hash.digest(newText.toString()).getDigest();
   }
-  
+
   private static class Token {
     public int cnt;
     public String val;
-    
+
     public Token(int cnt, String val) {
       this.cnt = cnt;
       this.val = val;
     }
-    
+
     public String toString() {
       return val + " " + cnt;
     }
   }
-  
+
   private static class TokenComparator implements Comparator<Token> {
     public int compare(Token t1, Token t2) {
       return t2.cnt - t1.cnt;
     }
   }
-  
+
   public static void main(String[] args) throws Exception {
     TextProfileSignature sig = new TextProfileSignature();
     sig.setConf(NutchConfiguration.create());
@@ -161,15 +175,18 @@ public class TextProfileSignature extend
     File[] files = new File(args[0]).listFiles();
     for (int i = 0; i < files.length; i++) {
       FileInputStream fis = new FileInputStream(files[i]);
-      BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
+      BufferedReader br = new BufferedReader(
+          new InputStreamReader(fis, "UTF-8"));
       StringBuffer text = new StringBuffer();
       String line = null;
       while ((line = br.readLine()) != null) {
-        if (text.length() > 0) text.append("\n");
+        if (text.length() > 0)
+          text.append("\n");
         text.append(line);
       }
       br.close();
-      byte[] signature = sig.calculate(null, new ParseImpl(text.toString(), null));
+      byte[] signature = sig.calculate(null, new ParseImpl(text.toString(),
+          null));
       res.put(files[i].toString(), signature);
     }
     Iterator<String> it = res.keySet().iterator();
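
To make the quantization described in the class Javadoc concrete, here is a
standalone sketch of just that step (not the class itself; the class
additionally sorts tokens by decreasing frequency before hashing the list):
counts are rounded down to the nearest multiple of QUANT, and tokens that
fall below QUANT are dropped. The token counts below are made up for the
example.

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class ProfileQuantSketch {
      public static void main(String[] args) {
        Map<String, Integer> counts = new HashMap<>();
        counts.put("nutch", 9);
        counts.put("crawl", 4);
        counts.put("the", 1);

        int maxFreq = 9;
        float quantRate = 0.01f;                  // db.signature.text_profile.quant_rate
        int quant = Math.round(maxFreq * quantRate);
        if (quant < 2) quant = (maxFreq > 1) ? 2 : 1;

        List<String> profile = new ArrayList<>();
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
          int quantized = (e.getValue() / quant) * quant; // round down to multiple of quant
          if (quantized < quant) continue;                // drop low-frequency tokens
          profile.add(e.getKey() + " " + quantized);
        }
        System.out.println(profile); // e.g. [nutch 8, crawl 4] (iteration order not guaranteed)
      }
    }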

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java Thu Jan 29 05:38:59 2015
@@ -33,8 +33,9 @@ import org.apache.nutch.util.URLUtil;
  * Partition urls by host, domain name or IP depending on the value of the
  * parameter 'partition.url.mode' which can be 'byHost', 'byDomain' or 'byIP'
  */
-public class URLPartitioner implements Partitioner<Text,Writable> {
-  private static final Logger LOG = LoggerFactory.getLogger(URLPartitioner.class);
+public class URLPartitioner implements Partitioner<Text, Writable> {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(URLPartitioner.class);
 
   public static final String PARTITION_MODE_KEY = "partition.url.mode";
 
@@ -58,7 +59,8 @@ public class URLPartitioner implements P
     normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
   }
 
-  public void close() {}
+  public void close() {
+  }
 
   /** Hash by domain name. */
   public int getPartition(Text key, Writable value, int numReduceTasks) {
@@ -66,15 +68,16 @@ public class URLPartitioner implements P
     URL url = null;
     int hashCode = urlString.hashCode();
     try {
-      urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
+      urlString = normalizers.normalize(urlString,
+          URLNormalizers.SCOPE_PARTITION);
       url = new URL(urlString);
       hashCode = url.getHost().hashCode();
     } catch (MalformedURLException e) {
       LOG.warn("Malformed URL: '" + urlString + "'");
     }
 
-    if (mode.equals(PARTITION_MODE_DOMAIN) && url != null) hashCode = URLUtil
-        .getDomainName(url).hashCode();
+    if (mode.equals(PARTITION_MODE_DOMAIN) && url != null)
+      hashCode = URLUtil.getDomainName(url).hashCode();
     else if (mode.equals(PARTITION_MODE_IP)) {
       try {
         InetAddress address = InetAddress.getByName(url.getHost());