You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/11/28 21:14:59 UTC
svn commit: r480188 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/fetcher/ java/org/apache/nutch/indexer/
java/org/apache/nutch/metadata/ java/org/apache/nutch/parse/
java/org/apache/nutch/segment/
plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/
Author: ab
Date: Tue Nov 28 12:14:58 2006
New Revision: 480188
URL: http://svn.apache.org/viewvc?view=rev&rev=480188
Log:
Move some constants to Nutch.java, so that Metadata could use them properly.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Nov 28 12:14:58 2006
@@ -33,6 +33,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
@@ -45,10 +46,6 @@
public static final Log LOG = LogFactory.getLog(Fetcher.class);
- public static final String SIGNATURE_KEY = "nutch.content.digest";
- public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
- public static final String SCORE_KEY = "nutch.crawl.score";
-
public static class InputFormat extends SequenceFileInputFormat {
/** Don't split inputs, to keep things polite. */
public FileSplit[] getSplits(FileSystem fs, JobConf job, int nSplits)
@@ -268,7 +265,7 @@
}
Metadata metadata = content.getMetadata();
// add segment to metadata
- metadata.set(SEGMENT_NAME_KEY, segmentName);
+ metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
// add score to content metadata so that ParseSegment can pick it up.
try {
scfilters.passScoreBeforeParsing(key, datum, content);
@@ -297,11 +294,11 @@
// Calculate page signature. For non-parsing fetchers this will
// be done in ParseSegment
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
- metadata.set(SIGNATURE_KEY, StringUtil.toHexString(signature));
+ metadata.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
datum.setSignature(signature);
// Ensure segment name and score are in parseData metadata
- parse.getData().getContentMeta().set(SEGMENT_NAME_KEY, segmentName);
- parse.getData().getContentMeta().set(SIGNATURE_KEY, StringUtil.toHexString(signature));
+ parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
+ parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
try {
scfilters.passScoreAfterParsing(key, content, parse);
} catch (Exception e) {
@@ -359,7 +356,7 @@
public void configure(JobConf job) {
setConf(job);
- this.segmentName = job.get(SEGMENT_NAME_KEY);
+ this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
this.storingContent = isStoringContent(job);
this.parsing = isParsing(job);
@@ -430,7 +427,7 @@
job.setJobName("fetch " + segment);
job.setInt("fetcher.threads.fetch", threads);
- job.set(SEGMENT_NAME_KEY, segment.getName());
+ job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
// for politeness, don't permit parallel execution of a single task
job.setSpeculativeExecution(false);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Tue Nov 28 12:14:58 2006
@@ -47,6 +47,7 @@
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
/** Create indexes for segments. */
public class Indexer extends ToolBase implements Reducer {
@@ -220,11 +221,11 @@
Metadata metadata = parseData.getContentMeta();
// add segment, used to map from merged index back to segment files
- doc.add(new Field("segment", metadata.get(Fetcher.SEGMENT_NAME_KEY),
+ doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
Field.Store.YES, Field.Index.NO));
// add digest, used by dedup
- doc.add(new Field("digest", metadata.get(Fetcher.SIGNATURE_KEY),
+ doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
Field.Store.YES, Field.Index.NO));
// if (LOG.isInfoEnabled()) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Tue Nov 28 12:14:58 2006
@@ -30,5 +30,11 @@
public static final String CHAR_ENCODING_FOR_CONVERSION =
"CharEncodingForConversion";
-
+
+ public static final String SIGNATURE_KEY = "nutch.content.digest";
+
+ public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+
+ public static final String SCORE_KEY = "nutch.crawl.score";
+
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Nov 28 12:14:58 2006
@@ -29,6 +29,7 @@
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.*;
import java.io.*;
@@ -89,7 +90,7 @@
ParseData parseData = parse.getData();
// recover the signature prepared by Fetcher or ParseSegment
- String sig = parseData.getContentMeta().get(Fetcher.SIGNATURE_KEY);
+ String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
if (sig != null) {
byte[] signature = StringUtil.fromHexString(sig);
if (signature != null) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Nov 28 12:14:58 2006
@@ -25,6 +25,7 @@
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.conf.*;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
@@ -80,7 +81,7 @@
// compute the new signature
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
- content.getMetadata().set(Fetcher.SIGNATURE_KEY, StringUtil.toHexString(signature));
+ content.getMetadata().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
if (status.isSuccess()) {
try {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Nov 28 12:14:58 2006
@@ -35,6 +35,7 @@
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.metadata.MetaWrapper;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
@@ -170,9 +171,9 @@
} else if (o instanceof ParseData) {
// update the segment name inside contentMeta - required by Indexer
if (slice == null) {
- ((ParseData)o).getContentMeta().set(Fetcher.SEGMENT_NAME_KEY, segmentName);
+ ((ParseData)o).getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
} else {
- ((ParseData)o).getContentMeta().set(Fetcher.SEGMENT_NAME_KEY, segmentName + "-" + slice);
+ ((ParseData)o).getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName + "-" + slice);
}
pd_out = ensureMapFile(slice, ParseData.DIR_NAME, ParseData.class);
pd_out.append(key, o);
Modified: lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=480188&r1=480187&r2=480188
==============================================================================
--- lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Tue Nov 28 12:14:58 2006
@@ -32,6 +32,7 @@
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
@@ -102,18 +103,18 @@
/** Store a float value of CrawlDatum.getScore() under Nutch.SCORE_KEY. */
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
- content.getMetadata().set(Fetcher.SCORE_KEY, "" + datum.getScore());
+ content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
}
/** Copy the value from Content metadata under Nutch.SCORE_KEY to parseData. */
public void passScoreAfterParsing(Text url, Content content, Parse parse) {
- parse.getData().getContentMeta().set(Fetcher.SCORE_KEY, content.getMetadata().get(Fetcher.SCORE_KEY));
+ parse.getData().getContentMeta().set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
}
/** Get a float value from Nutch.SCORE_KEY, divide it by the number of outlinks and apply. */
public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
float score = scoreInjected;
- String scoreString = parseData.getContentMeta().get(Fetcher.SCORE_KEY);
+ String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
if (scoreString != null) {
try {
score = Float.parseFloat(scoreString);