You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/11/28 21:14:59 UTC

svn commit: r480188 - in /lucene/nutch/trunk/src: java/org/apache/nutch/fetcher/ java/org/apache/nutch/indexer/ java/org/apache/nutch/metadata/ java/org/apache/nutch/parse/ java/org/apache/nutch/segment/ plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/

Author: ab
Date: Tue Nov 28 12:14:58 2006
New Revision: 480188

URL: http://svn.apache.org/viewvc?view=rev&rev=480188
Log:
Move some constants to Nutch.java, so that Metadata could use them properly.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
    lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Nov 28 12:14:58 2006
@@ -33,6 +33,7 @@
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.*;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.parse.*;
@@ -45,10 +46,6 @@
 
   public static final Log LOG = LogFactory.getLog(Fetcher.class);
   
-  public static final String SIGNATURE_KEY = "nutch.content.digest";
-  public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
-  public static final String SCORE_KEY = "nutch.crawl.score";
-
   public static class InputFormat extends SequenceFileInputFormat {
     /** Don't split inputs, to keep things polite. */
     public FileSplit[] getSplits(FileSystem fs, JobConf job, int nSplits)
@@ -268,7 +265,7 @@
       }
       Metadata metadata = content.getMetadata();
       // add segment to metadata
-      metadata.set(SEGMENT_NAME_KEY, segmentName);
+      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
       // add score to content metadata so that ParseSegment can pick it up.
       try {
         scfilters.passScoreBeforeParsing(key, datum, content);
@@ -297,11 +294,11 @@
         // Calculate page signature. For non-parsing fetchers this will
         // be done in ParseSegment
         byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
-        metadata.set(SIGNATURE_KEY, StringUtil.toHexString(signature));
+        metadata.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
         datum.setSignature(signature);
         // Ensure segment name and score are in parseData metadata
-        parse.getData().getContentMeta().set(SEGMENT_NAME_KEY, segmentName);
-        parse.getData().getContentMeta().set(SIGNATURE_KEY, StringUtil.toHexString(signature));
+        parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
+        parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
         try {
           scfilters.passScoreAfterParsing(key, content, parse);
         } catch (Exception e) {
@@ -359,7 +356,7 @@
   public void configure(JobConf job) {
     setConf(job);
 
-    this.segmentName = job.get(SEGMENT_NAME_KEY);
+    this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
     this.storingContent = isStoringContent(job);
     this.parsing = isParsing(job);
 
@@ -430,7 +427,7 @@
     job.setJobName("fetch " + segment);
 
     job.setInt("fetcher.threads.fetch", threads);
-    job.set(SEGMENT_NAME_KEY, segment.getName());
+    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
 
     // for politeness, don't permit parallel execution of a single task
     job.setSpeculativeExecution(false);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Tue Nov 28 12:14:58 2006
@@ -47,6 +47,7 @@
 import org.apache.lucene.index.*;
 import org.apache.lucene.document.*;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 
 /** Create indexes for segments. */
 public class Indexer extends ToolBase implements Reducer {
@@ -220,11 +221,11 @@
     Metadata metadata = parseData.getContentMeta();
 
     // add segment, used to map from merged index back to segment files
-    doc.add(new Field("segment", metadata.get(Fetcher.SEGMENT_NAME_KEY),
+    doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
             Field.Store.YES, Field.Index.NO));
 
     // add digest, used by dedup
-    doc.add(new Field("digest", metadata.get(Fetcher.SIGNATURE_KEY),
+    doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
             Field.Store.YES, Field.Index.NO));
 
 //     if (LOG.isInfoEnabled()) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Tue Nov 28 12:14:58 2006
@@ -30,5 +30,11 @@
   
   public static final String CHAR_ENCODING_FOR_CONVERSION =
           "CharEncodingForConversion";
-    
+
+  public static final String SIGNATURE_KEY = "nutch.content.digest";
+
+  public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+
+  public static final String SCORE_KEY = "nutch.crawl.score";
+
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Nov 28 12:14:58 2006
@@ -29,6 +29,7 @@
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.*;
 
 import java.io.*;
@@ -89,7 +90,7 @@
           
           ParseData parseData = parse.getData();
           // recover the signature prepared by Fetcher or ParseSegment
-          String sig = parseData.getContentMeta().get(Fetcher.SIGNATURE_KEY);
+          String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
           if (sig != null) {
             byte[] signature = StringUtil.fromHexString(sig);
             if (signature != null) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Nov 28 12:14:58 2006
@@ -25,6 +25,7 @@
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.conf.*;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
@@ -80,7 +81,7 @@
 
     // compute the new signature
     byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
-    content.getMetadata().set(Fetcher.SIGNATURE_KEY, StringUtil.toHexString(signature));
+    content.getMetadata().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
     
     if (status.isSuccess()) {
       try {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Nov 28 12:14:58 2006
@@ -35,6 +35,7 @@
 import org.apache.nutch.fetcher.Fetcher;
 import org.apache.nutch.metadata.MetaWrapper;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseText;
@@ -170,9 +171,9 @@
           } else if (o instanceof ParseData) {
             // update the segment name inside contentMeta - required by Indexer
             if (slice == null) {
-              ((ParseData)o).getContentMeta().set(Fetcher.SEGMENT_NAME_KEY, segmentName);
+              ((ParseData)o).getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
             } else {
-              ((ParseData)o).getContentMeta().set(Fetcher.SEGMENT_NAME_KEY, segmentName + "-" + slice);
+              ((ParseData)o).getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName + "-" + slice);
             }
             pd_out = ensureMapFile(slice, ParseData.DIR_NAME, ParseData.class);
             pd_out.append(key, o);

Modified: lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Tue Nov 28 12:14:58 2006
@@ -32,6 +32,7 @@
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.fetcher.Fetcher;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.protocol.Content;
@@ -102,18 +103,18 @@
 
   /** Store a float value of CrawlDatum.getScore() under Nutch.SCORE_KEY. */
   public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
-    content.getMetadata().set(Fetcher.SCORE_KEY, "" + datum.getScore());
+    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
   }
 
   /** Copy the value from Content metadata under Nutch.SCORE_KEY to parseData. */
   public void passScoreAfterParsing(Text url, Content content, Parse parse) {
-    parse.getData().getContentMeta().set(Fetcher.SCORE_KEY, content.getMetadata().get(Fetcher.SCORE_KEY));
+    parse.getData().getContentMeta().set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
   }
 
   /** Get a float value from Nutch.SCORE_KEY, divide it by the number of outlinks and apply. */
   public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
     float score = scoreInjected;
-    String scoreString = parseData.getContentMeta().get(Fetcher.SCORE_KEY);
+    String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
     if (scoreString != null) {
       try {
         score = Float.parseFloat(scoreString);