Posted to commits@nutch.apache.org by ab...@apache.org on 2006/10/16 22:39:02 UTC
svn commit: r464654 [1/2] - in /lucene/nutch/trunk: ./ bin/ lib/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/parse/ src/java/org/ap...
Author: ab
Date: Mon Oct 16 13:38:57 2006
New Revision: 464654
URL: http://svn.apache.org/viewvc?view=rev&rev=464654
Log:
NUTCH-383: upgrade to Hadoop 0.7.1 and Lucene 2.0.0.
NUTCH-373: replace DeleteDuplicates with a version that implements both
parts of the algorithm. Add JUnit test.
Added:
lucene/nutch/trunk/lib/hadoop-0.7.1.jar (with props)
lucene/nutch/trunk/lib/lucene-core-2.0.0.jar (with props)
lucene/nutch/trunk/lib/lucene-misc-2.0.0.jar (with props)
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (with props)
lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/
lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java (with props)
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar (with props)
lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar (with props)
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
Removed:
lucene/nutch/trunk/lib/hadoop-0.5.0.jar
lucene/nutch/trunk/lib/lucene-core-1.9.1.jar
lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar
lucene/nutch/trunk/src/java/org/apache/nutch/util/ToolBase.java
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-1.9-rc1-dev.jar
lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0-rc1-dev.jar
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/bin/nutch
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml
lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Oct 16 13:38:57 2006
@@ -48,6 +48,19 @@
Response.CONTENT_ENCODING is gzip or x-gzip , it can not fetch any thing
(King Kong via pkosiorowski)
+17. NUTCH-383 - upgrade to Hadoop 0.7.1 and Lucene 2.0.0. (ab)
+
+ ****************************** WARNING !!! ********************************
+ * This upgrade breaks data format compatibility. A tool 'convertdb' *
+ * was added to migrate existing CrawlDb-s to the new format. Segment data *
+ * can be partially migrated using 'mergesegs', however segments will *
+ * require re-parsing (and consequently re-indexing). *
+ ****************************** WARNING !!! ********************************
+
+18. NUTCH-371 - DeleteDuplicates now correctly implements both parts of
+ the algorithm. (ab)
+
+
Release 0.8 - 2006-07-25
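A note on the migration path described in the warning above: the CHANGES
entry refers to the tool as 'convertdb', while the bin/nutch change below
registers it under the command name 'convdb'. The exact arguments of
CrawlDbConverter are not shown in this part of the diff, so the argument
names in the following sketch are hypothetical:

    # convert an existing pre-0.9 CrawlDb to the new format (args hypothetical)
    bin/nutch convdb <old_crawldb> <new_crawldb>
    # partially migrate segment data, then re-parse and re-index the result
    bin/nutch mergesegs <output_segments_dir> <segment> ...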
Modified: lucene/nutch/trunk/bin/nutch
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Mon Oct 16 13:38:57 2006
@@ -34,6 +34,7 @@
echo "where COMMAND is one of:"
echo " crawl one-step crawler for intranets"
echo " readdb read / dump crawl db"
+ echo " convdb convert crawl db from pre-0.9 format"
echo " mergedb merge crawldb-s, with optional filtering"
echo " readlinkdb read / dump link db"
echo " inject inject new urls into the database"
@@ -154,6 +155,8 @@
CLASS=org.apache.nutch.parse.ParseSegment
elif [ "$COMMAND" = "readdb" ] ; then
CLASS=org.apache.nutch.crawl.CrawlDbReader
+elif [ "$COMMAND" = "convdb" ] ; then
+ CLASS=org.apache.nutch.tools.compat.CrawlDbConverter
elif [ "$COMMAND" = "mergedb" ] ; then
CLASS=org.apache.nutch.crawl.CrawlDbMerger
elif [ "$COMMAND" = "readlinkdb" ] ; then
Added: lucene/nutch/trunk/lib/hadoop-0.7.1.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.7.1.jar?view=auto&rev=464654
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/lib/hadoop-0.7.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/lib/lucene-core-2.0.0.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-core-2.0.0.jar?view=auto&rev=464654
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/lib/lucene-core-2.0.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/lib/lucene-misc-2.0.0.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/lucene-misc-2.0.0.jar?view=auto&rev=464654
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/lib/lucene-misc-2.0.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Mon Oct 16 13:38:57 2006
@@ -28,10 +28,10 @@
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.ToolBase;
/**
* This class takes the output of the fetcher and updates the
@@ -97,15 +97,13 @@
job.addInputPath(current);
}
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
- job.setInputValueClass(CrawlDatum.class);
job.setMapperClass(CrawlDbFilter.class);
job.setReducerClass(CrawlDbReducer.class);
job.setOutputPath(newCrawlDb);
job.setOutputFormat(MapFileOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
return job;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Mon Oct 16 13:38:57 2006
@@ -20,7 +20,7 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
@@ -71,6 +71,8 @@
}
public void close() {}
+
+ private Text newKey = new Text();
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
@@ -92,7 +94,6 @@
}
}
if (url != null) { // if it passes
- UTF8 newKey = (UTF8) key;
newKey.set(url); // collect it
output.collect(newKey, value);
}
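The CrawlDbFilter change above replaces the per-record cast of the input
key to UTF8 with a single preallocated Text field that is reused across
map() calls. A minimal standalone sketch of the same reuse idiom (class
and method names here are illustrative, not part of this patch):

    // One mutable Text is allocated per mapper and refilled with set()
    // for every record, instead of creating a new key object each time.
    import org.apache.hadoop.io.Text;

    public class ReusedKeySketch {
      private final Text newKey = new Text(); // shared across calls

      // returns the shared key holding the given (possibly rewritten) url
      public Text keyFor(String url) {
        newKey.set(url); // overwrites previous contents, no new allocation
        return newKey;
      }

      public static void main(String[] args) {
        ReusedKeySketch s = new ReusedKeySketch();
        System.out.println(s.keyFor("http://example.com/"));
      }
    }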
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Mon Oct 16 13:38:57 2006
@@ -27,14 +27,14 @@
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.ToolBase;
/**
* This tool merges several CrawlDb-s into one, optionally filtering
@@ -119,7 +119,7 @@
job.setJobName("crawldb merge " + output);
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(CrawlDatum.class);
job.setMapperClass(CrawlDbFilter.class);
@@ -129,7 +129,7 @@
job.setOutputPath(newCrawlDb);
job.setOutputFormat(MapFileOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
return job;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Mon Oct 16 13:38:57 2006
@@ -33,7 +33,7 @@
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
@@ -88,10 +88,10 @@
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter)
throws IOException {
CrawlDatum cd = (CrawlDatum) value;
- output.collect(new UTF8("T"), COUNT_1);
- output.collect(new UTF8("status " + cd.getStatus()), COUNT_1);
- output.collect(new UTF8("retry " + cd.getRetriesSinceFetch()), COUNT_1);
- output.collect(new UTF8("s"), new LongWritable((long) (cd.getScore() * 1000.0)));
+ output.collect(new Text("T"), COUNT_1);
+ output.collect(new Text("status " + cd.getStatus()), COUNT_1);
+ output.collect(new Text("retry " + cd.getRetriesSinceFetch()), COUNT_1);
+ output.collect(new Text("s"), new LongWritable((long) (cd.getScore() * 1000.0)));
}
}
@@ -104,7 +104,7 @@
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
throws IOException {
val.set(0L);
- String k = ((UTF8)key).toString();
+ String k = ((Text)key).toString();
if (!k.equals("s")) {
while (values.hasNext()) {
LongWritable cnt = (LongWritable)values.next();
@@ -121,9 +121,9 @@
if (cnt.get() > max) max = cnt.get();
total += cnt.get();
}
- output.collect(new UTF8("scn"), new LongWritable(min));
- output.collect(new UTF8("scx"), new LongWritable(max));
- output.collect(new UTF8("sct"), new LongWritable(total));
+ output.collect(new Text("scn"), new LongWritable(min));
+ output.collect(new Text("scx"), new LongWritable(max));
+ output.collect(new Text("sct"), new LongWritable(total));
}
}
}
@@ -134,7 +134,7 @@
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
throws IOException {
- String k = ((UTF8) key).toString();
+ String k = ((Text) key).toString();
if (k.equals("T")) {
// sum all values for this key
long sum = 0;
@@ -244,7 +244,7 @@
job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(CrawlDatum.class);
job.setMapperClass(CrawlDbStatMapper.class);
@@ -253,7 +253,7 @@
job.setOutputPath(tmpFolder);
job.setOutputFormat(SequenceFileOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
JobClient.runJob(job);
@@ -262,7 +262,7 @@
FileSystem fileSystem = FileSystem.get(config);
SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);
- UTF8 key = new UTF8();
+ Text key = new Text();
LongWritable value = new LongWritable();
TreeMap stats = new TreeMap();
@@ -315,7 +315,7 @@
}
public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException {
- UTF8 key = new UTF8(url);
+ Text key = new Text(url);
CrawlDatum val = new CrawlDatum();
openReaders(crawlDb, config);
CrawlDatum res = (CrawlDatum)MapFileOutputFormat.getEntry(readers, new HashPartitioner(), key, val);
@@ -346,12 +346,12 @@
job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(CrawlDatum.class);
job.setOutputPath(outFolder);
job.setOutputFormat(TextOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
JobClient.runJob(job);
@@ -375,7 +375,7 @@
job.setJobName("topN prepare " + crawlDb);
job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(CrawlDatum.class);
job.setMapperClass(CrawlDbTopNMapper.class);
job.setReducerClass(IdentityReducer.class);
@@ -383,7 +383,7 @@
job.setOutputPath(tempDir);
job.setOutputFormat(SequenceFileOutputFormat.class);
job.setOutputKeyClass(FloatWritable.class);
- job.setOutputValueClass(UTF8.class);
+ job.setOutputValueClass(Text.class);
// XXX hmmm, no setFloat() in the API ... :(
job.setLong("CrawlDbReader.topN.min", Math.round(1000000.0 * min));
@@ -399,14 +399,14 @@
job.addInputPath(tempDir);
job.setInputFormat(SequenceFileInputFormat.class);
job.setInputKeyClass(FloatWritable.class);
- job.setInputValueClass(UTF8.class);
+ job.setInputValueClass(Text.class);
job.setMapperClass(IdentityMapper.class);
job.setReducerClass(CrawlDbTopNReducer.class);
job.setOutputPath(outFolder);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(FloatWritable.class);
- job.setOutputValueClass(UTF8.class);
+ job.setOutputValueClass(Text.class);
// XXX *sigh* this apparently doesn't work ... :-((
job.setNumReduceTasks(1); // create a single file.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Oct 16 13:38:57 2006
@@ -109,7 +109,7 @@
} else {
result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
try {
- scfilters.initialScore((UTF8)key, result);
+ scfilters.initialScore((Text)key, result);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Cannot filter init score for url " + key +
@@ -152,7 +152,7 @@
}
try {
- scfilters.updateDbScore((UTF8)key, old, result, linked);
+ scfilters.updateDbScore((Text)key, old, result, linked);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't update score, key=" + key + ": " + e);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Oct 16 13:38:57 2006
@@ -29,6 +29,7 @@
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.net.URLFilterException;
@@ -38,7 +39,6 @@
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.ToolBase;
/** Generates a subset of a crawl db to fetch. */
public class Generator extends ToolBase {
@@ -46,11 +46,11 @@
public static final Log LOG = LogFactory.getLog(Generator.class);
public static class SelectorEntry implements Writable {
- public UTF8 url;
+ public Text url;
public CrawlDatum datum;
public SelectorEntry() {
- url = new UTF8();
+ url = new Text();
datum = new CrawlDatum();
}
@@ -102,7 +102,7 @@
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter)
throws IOException {
- UTF8 url = (UTF8)key;
+ Text url = (Text)key;
// don't generate URLs that don't pass URLFilters
try {
if (filters.filter(url.toString()) == null)
@@ -122,7 +122,7 @@
float sort = 1.0f;
try {
- sort = scfilters.generatorSortValue((UTF8)key, crawlDatum, sort);
+ sort = scfilters.generatorSortValue((Text)key, crawlDatum, sort);
} catch (ScoringFilterException sfe) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe);
@@ -131,7 +131,7 @@
// sort by decreasing score, using DecreasingFloatComparator
sortValue.set(sort);
entry.datum = crawlDatum;
- entry.url = (UTF8)key;
+ entry.url = (Text)key;
output.collect(sortValue, entry); // invert for sort by score
}
@@ -150,7 +150,7 @@
while (values.hasNext() && count < limit) {
SelectorEntry entry = (SelectorEntry)values.next();
- UTF8 url = entry.url;
+ Text url = entry.url;
if (maxPerHost > 0) { // are we counting hosts?
String host = new URL(url.toString()).getHost();
@@ -236,11 +236,11 @@
/** Sort fetch lists by hash of URL. */
public static class HashComparator extends WritableComparator {
- public HashComparator() { super(UTF8.class); }
+ public HashComparator() { super(Text.class); }
public int compare(WritableComparable a, WritableComparable b) {
- UTF8 url1 = (UTF8)a;
- UTF8 url2 = (UTF8)b;
+ Text url1 = (Text)a;
+ Text url2 = (Text)b;
int hash1 = hash(url1.getBytes(), 0, url1.getLength());
int hash2 = hash(url2.getBytes(), 0, url2.getLength());
if (hash1 != hash2) {
@@ -252,14 +252,12 @@
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- int n1 = readUnsignedShort(b1, s1);
- int n2 = readUnsignedShort(b2, s2);
- int hash1 = hash(b1, s1+2, n1);
- int hash2 = hash(b2, s2+2, n2);
+ int hash1 = hash(b1, s1, l1);
+ int hash2 = hash(b2, s2, l2);
if (hash1 != hash2) {
return hash1 - hash2;
}
- return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
+ return compareBytes(b1, s1, l1, b2, s2, l2);
}
private static int hash(byte[] bytes, int start, int length) {
@@ -319,7 +317,7 @@
job.setInputPath(new Path(dbDir, CrawlDatum.DB_DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(CrawlDatum.class);
job.setMapperClass(Selector.class);
@@ -353,7 +351,7 @@
job.setOutputPath(output);
job.setOutputFormat(SequenceFileOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
job.setOutputKeyComparatorClass(HashComparator.class);
JobClient.runJob(job);
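The raw-bytes comparator change in Generator above follows from the two
wire formats: UTF8 serializes as a fixed two-byte unsigned length followed
by the bytes (hence the old readUnsignedShort and the s+2 offsets), whereas
Text serializes as a variable-length vint followed by UTF-8 bytes, so the
new code hashes and compares the serialized span as-is. A small sketch of
the Text wire format, assuming the stock Hadoop io helpers:

    // Serialize a Text and decode its vint length prefix by hand.
    import java.io.ByteArrayOutputStream;
    import java.io.DataOutputStream;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparator;
    import org.apache.hadoop.io.WritableUtils;

    public class TextWireFormat {
      public static void main(String[] args) throws Exception {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        new Text("http://example.com/").write(new DataOutputStream(bos));
        byte[] raw = bos.toByteArray();
        int payload = WritableComparator.readVInt(raw, 0); // string length
        int prefix = WritableUtils.decodeVIntSize(raw[0]); // vint size in bytes
        System.out.println(payload + " payload bytes after a "
            + prefix + "-byte length prefix");
      }
    }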
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Mon Oct 16 13:38:57 2006
@@ -28,13 +28,13 @@
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.net.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.ToolBase;
/** This class takes a flat file of URLs and adds them to the of pages to be
* crawled. Useful for bootstrapping the system. */
@@ -65,7 +65,7 @@
public void map(WritableComparable key, Writable val,
OutputCollector output, Reporter reporter)
throws IOException {
- UTF8 value = (UTF8)val;
+ Text value = (Text)val;
String url = value.toString(); // value is line of text
// System.out.println("url: " +url);
try {
@@ -138,7 +138,7 @@
sortJob.setOutputPath(tempDir);
sortJob.setOutputFormat(SequenceFileOutputFormat.class);
- sortJob.setOutputKeyClass(UTF8.class);
+ sortJob.setOutputKeyClass(Text.class);
sortJob.setOutputValueClass(CrawlDatum.class);
JobClient.runJob(sortJob);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java Mon Oct 16 13:38:57 2006
@@ -33,19 +33,19 @@
}
public void readFields(DataInput in) throws IOException {
- fromUrl = UTF8.readString(in);
- anchor = UTF8.readString(in);
+ fromUrl = Text.readString(in);
+ anchor = Text.readString(in);
}
/** Skips over one Inlink in the input. */
public static void skip(DataInput in) throws IOException {
- UTF8.skip(in); // skip fromUrl
- UTF8.skip(in); // skip anchor
+ Text.skip(in); // skip fromUrl
+ Text.skip(in); // skip anchor
}
public void write(DataOutput out) throws IOException {
- UTF8.writeString(out, fromUrl);
- UTF8.writeString(out, anchor);
+ Text.writeString(out, fromUrl);
+ Text.writeString(out, anchor);
}
public static Inlink read(DataInput in) throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Mon Oct 16 13:38:57 2006
@@ -30,13 +30,13 @@
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.ToolBase;
/** Maintains an inverted link map, listing incoming links for each url. */
public class LinkDb extends ToolBase implements Mapper, Reducer {
@@ -161,7 +161,7 @@
anchor = anchor.substring(0, maxAnchorLength);
}
inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link
- output.collect(new UTF8(toUrl), inlinks);
+ output.collect(new Text(toUrl), inlinks);
}
}
@@ -256,7 +256,7 @@
job.setJobName("linkdb " + linkDb);
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(ParseData.class);
job.setMapperClass(LinkDb.class);
@@ -277,7 +277,7 @@
job.setOutputPath(newLinkDb);
job.setOutputFormat(MapFileOutputFormat.class);
job.setBoolean("mapred.output.compress", true);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Inlinks.class);
return job;
@@ -292,7 +292,7 @@
job.setJobName("linkdb merge " + linkDb);
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(Inlinks.class);
job.setMapperClass(LinkDbFilter.class);
@@ -303,7 +303,7 @@
job.setOutputPath(newLinkDb);
job.setOutputFormat(MapFileOutputFormat.class);
job.setBoolean("mapred.output.compress", true);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Inlinks.class);
return job;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Mon Oct 16 13:38:57 2006
@@ -17,6 +17,8 @@
import java.util.ArrayList;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
@@ -24,8 +26,8 @@
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.ToolBase;
/**
* This tool merges several LinkDb-s into one, optionally filtering
@@ -46,6 +48,8 @@
* @author Andrzej Bialecki
*/
public class LinkDbMerger extends ToolBase {
+ private static final Log LOG = LogFactory.getLog(LinkDbMerger.class);
+
public LinkDbMerger() {
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Mon Oct 16 13:38:57 2006
@@ -27,11 +27,11 @@
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.ToolBase;
import java.util.Iterator;
@@ -59,14 +59,14 @@
this.directory = directory;
}
- public String[] getAnchors(UTF8 url) throws IOException {
+ public String[] getAnchors(Text url) throws IOException {
Inlinks inlinks = getInlinks(url);
if (inlinks == null)
return null;
return inlinks.getAnchors();
}
- public Inlinks getInlinks(UTF8 url) throws IOException {
+ public Inlinks getInlinks(Text url) throws IOException {
if (readers == null) {
synchronized(this) {
@@ -100,12 +100,12 @@
job.addInputPath(new Path(linkdb, LinkDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(Inlinks.class);
job.setOutputPath(outFolder);
job.setOutputFormat(TextOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Inlinks.class);
JobClient.runJob(job);
@@ -129,7 +129,7 @@
return 0;
} else if (args[1].equals("-url")) {
init(new Path(args[0]));
- Inlinks links = getInlinks(new UTF8(args[2]));
+ Inlinks links = getInlinks(new Text(args[2]));
if (links == null) {
System.out.println(" - no link information.");
} else {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Mon Oct 16 13:38:57 2006
@@ -39,7 +39,7 @@
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
@@ -81,7 +81,7 @@
addToMap(NullWritable.class, new Byte((byte) -127));
addToMap(LongWritable.class, new Byte((byte) -126));
- addToMap(UTF8.class, new Byte((byte) -125));
+ addToMap(Text.class, new Byte((byte) -125));
addToMap(MD5Hash.class, new Byte((byte) -124));
addToMap(org.apache.nutch.fetcher.FetcherOutput.class,
new Byte((byte) -123));
@@ -305,7 +305,7 @@
ClassIdEntry entry = fIdFirst;
while (entry != null) {
out.writeByte(entry.fId);
- UTF8.writeString(out, entry.fclazz.getName());
+ Text.writeString(out, entry.fclazz.getName());
entry = entry.fNextIdEntry;
}
}
@@ -336,7 +336,7 @@
for (int i = 0; i < fIdCount; i++) {
try {
id = in.readByte();
- clazz = Class.forName(UTF8.readString(in));
+ clazz = Class.forName(Text.readString(in));
addIdEntry(id, clazz);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java Mon Oct 16 13:38:57 2006
@@ -42,7 +42,7 @@
/** Hash by hostname. */
public int getPartition(WritableComparable key, Writable value,
int numReduceTasks) {
- String urlString = ((UTF8)key).toString();
+ String urlString = ((Text)key).toString();
try {
urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
} catch (Exception e) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Oct 16 13:38:57 2006
@@ -27,6 +27,7 @@
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
@@ -101,7 +102,7 @@
synchronized (Fetcher.this) {activeThreads++;} // count threads
try {
- UTF8 key = new UTF8();
+ Text key = new Text();
CrawlDatum datum = new CrawlDatum();
while (true) {
@@ -128,7 +129,7 @@
}
// url may be changed through redirects.
- UTF8 url = new UTF8();
+ Text url = new Text();
url.set(key);
try {
if (LOG.isInfoEnabled()) { LOG.info("fetching " + url); }
@@ -158,7 +159,7 @@
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = this.urlFilters.filter(newUrl);
if (newUrl != null && !newUrl.equals(url.toString())) {
- url = new UTF8(newUrl);
+ url = new Text(newUrl);
redirecting = true;
redirectCount++;
if (LOG.isDebugEnabled()) {
@@ -177,7 +178,7 @@
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = this.urlFilters.filter(newUrl);
if (newUrl != null && !newUrl.equals(url.toString())) {
- url = new UTF8(newUrl);
+ url = new Text(newUrl);
redirecting = true;
redirectCount++;
if (LOG.isDebugEnabled()) {
@@ -245,7 +246,7 @@
}
}
- private void logError(UTF8 url, String message) {
+ private void logError(Text url, String message) {
if (LOG.isInfoEnabled()) {
LOG.info("fetch of " + url + " failed with: " + message);
}
@@ -254,7 +255,7 @@
}
}
- private ParseStatus output(UTF8 key, CrawlDatum datum,
+ private ParseStatus output(Text key, CrawlDatum datum,
Content content, int status) {
datum.setStatus(status);
@@ -435,14 +436,14 @@
job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
job.setInputFormat(InputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(CrawlDatum.class);
job.setMapRunnerClass(Fetcher.class);
job.setOutputPath(segment);
job.setOutputFormat(FetcherOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FetcherOutput.class);
JobClient.runJob(job);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Mon Oct 16 13:38:57 2006
@@ -25,7 +25,7 @@
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
@@ -55,7 +55,7 @@
new Path(new Path(job.getOutputPath(), Content.DIR_NAME), name);
final MapFile.Writer fetchOut =
- new MapFile.Writer(fs, fetch.toString(), UTF8.class, CrawlDatum.class);
+ new MapFile.Writer(fs, fetch.toString(), Text.class, CrawlDatum.class);
return new RecordWriter() {
private MapFile.Writer contentOut;
@@ -64,7 +64,7 @@
{
if (Fetcher.isStoringContent(job)) {
contentOut = new MapFile.Writer(fs, content.toString(),
- UTF8.class, Content.class);
+ Text.class, Content.class);
}
if (Fetcher.isParsing(job)) {
Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=auto&rev=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Mon Oct 16 13:38:57 2006
@@ -0,0 +1,507 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.*;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
+
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.document.Document;
+
+/**
+ * Delete duplicate documents in a set of Lucene indexes.
+ * Duplicates have either the same contents (via MD5 hash) or the same URL.
+ *
+ * This tool uses the following algorithm:
+ *
+ * <ul>
+ * <li><b>Phase 1 - remove URL duplicates:</b><br/>
+ * In this phase documents with the same URL
+ * are compared, and only the most recent document is retained -
+ * all other URL duplicates are scheduled for deletion.</li>
+ * <li><b>Phase 2 - remove content duplicates:</b><br/>
+ * In this phase documents with the same content hash are compared. If
+ * property "dedup.keep.highest.score" is set to true (default) then only
+ * the document with the highest score is retained. If this property is set
+ * to false, only the document with the shortest URL is retained - all other
+ * content duplicates are scheduled for deletion.</li>
+ * <li><b>Phase 3 - delete documents:</b><br/>
+ * In this phase documents scheduled for deletion are marked as deleted in
+ * Lucene index(es).</li>
+ * </ul>
+ *
+ * @author Andrzej Bialecki
+ */
+public class DeleteDuplicates extends ToolBase
+ implements Mapper, Reducer, OutputFormat {
+ private static final Log LOG = LogFactory.getLog(DeleteDuplicates.class);
+
+// Algorithm:
+//
+// 1. map indexes -> <url, <md5, url, time, urlLen, index,doc>>
+// reduce, deleting all but most recent
+//
+// 2. map indexes -> <md5, <md5, url, time, urlLen, index,doc>>
+// partition by md5
+// reduce, deleting all but with highest score (or shortest url).
+
+ public static class IndexDoc implements WritableComparable {
+ private Text url = new Text();
+ private int urlLen;
+ private float score;
+ private long time;
+ private MD5Hash hash = new MD5Hash();
+ private Text index = new Text(); // the segment index
+ private int doc; // within the index
+ private boolean keep = true; // keep or discard
+
+ public String toString() {
+ return "[url=" + url + ",score=" + score + ",time=" + time
+ + ",hash=" + hash + ",index=" + index + ",doc=" + doc
+ + ",keep=" + keep + "]";
+ }
+
+ public void write(DataOutput out) throws IOException {
+ url.write(out);
+ out.writeFloat(score);
+ out.writeLong(time);
+ hash.write(out);
+ index.write(out);
+ out.writeInt(doc);
+ out.writeBoolean(keep);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ url.readFields(in);
+ urlLen = url.getLength();
+ score = in.readFloat();
+ time = in.readLong();
+ hash.readFields(in);
+ index.readFields(in);
+ doc = in.readInt();
+ keep = in.readBoolean();
+ }
+
+ public int compareTo(Object o) {
+ IndexDoc that = (IndexDoc)o;
+ if (this.keep != that.keep) {
+ return this.keep ? 1 : -1;
+ } else if (!this.hash.equals(that.hash)) { // order first by hash
+ return this.hash.compareTo(that.hash);
+ } else if (this.time != that.time) { // prefer more recent docs
+ return this.time > that.time ? 1 : -1;
+ } else if (this.urlLen != that.urlLen) { // prefer shorter urls
+ return this.urlLen - that.urlLen;
+ } else {
+ return this.score > that.score ? 1 : -1;
+ }
+ }
+
+ public boolean equals(Object o) {
+ IndexDoc that = (IndexDoc)o;
+ return this.keep == that.keep
+ && this.hash.equals(that.hash)
+ && this.time == that.time
+ && this.score == that.score
+ && this.urlLen == that.urlLen
+ && this.index.equals(that.index)
+ && this.doc == that.doc;
+ }
+
+ }
+
+ public static class InputFormat extends InputFormatBase {
+ private static final long INDEX_LENGTH = Integer.MAX_VALUE;
+
+ /** Return each index as a split. */
+ public FileSplit[] getSplits(FileSystem fs, JobConf job,
+ int numSplits)
+ throws IOException {
+ Path[] files = listPaths(fs, job);
+ FileSplit[] splits = new FileSplit[files.length];
+ for (int i = 0; i < files.length; i++) {
+ splits[i] = new FileSplit(files[i], 0, INDEX_LENGTH);
+ }
+ return splits;
+ }
+
+ public class DDRecordReader implements RecordReader {
+
+ private IndexReader indexReader;
+ private int maxDoc;
+ private int doc;
+ private Text index;
+
+ public DDRecordReader(FileSystem fs, FileSplit split, JobConf job,
+ Text index) throws IOException {
+ indexReader = IndexReader.open(new FsDirectory(fs, split.getPath(), false, job));
+ maxDoc = indexReader.maxDoc();
+ this.index = index;
+ }
+
+ public boolean next(Writable key, Writable value)
+ throws IOException {
+
+ // skip deleted documents
+ while (doc < maxDoc && indexReader.isDeleted(doc)) doc++;
+ if (doc >= maxDoc)
+ return false;
+
+ Document document = indexReader.document(doc);
+
+ // fill in key
+ ((Text)key).set(document.get("url"));
+ // fill in value
+ IndexDoc indexDoc = (IndexDoc)value;
+ indexDoc.keep = true;
+ indexDoc.url.set(document.get("url"));
+ indexDoc.hash.setDigest(document.get("digest"));
+ indexDoc.score = Float.parseFloat(document.get("boost"));
+ try {
+ indexDoc.time = DateTools.stringToTime(document.get("tstamp"));
+ } catch (Exception e) {
+ // try to figure out the time from segment name
+ try {
+ String segname = document.get("segment");
+ indexDoc.time = new SimpleDateFormat("yyyyMMddHHmmss").parse(segname).getTime();
+ // make it unique
+ indexDoc.time += doc;
+ } catch (Exception e1) {
+ // use current time
+ indexDoc.time = System.currentTimeMillis();
+ }
+ }
+ indexDoc.index = index;
+ indexDoc.doc = doc;
+
+ doc++;
+
+ return true;
+ }
+
+ public long getPos() throws IOException {
+ return maxDoc==0 ? 0 : (doc*INDEX_LENGTH)/maxDoc;
+ }
+
+ public void close() throws IOException {
+ indexReader.close();
+ }
+
+ public WritableComparable createKey() {
+ return new Text();
+ }
+
+ public Writable createValue() {
+ return new IndexDoc();
+ }
+ }
+
+ /** Return a record reader for the given index split. */
+ public RecordReader getRecordReader(final FileSystem fs,
+ final FileSplit split,
+ final JobConf job,
+ Reporter reporter) throws IOException {
+ final Text index = new Text(split.getPath().toString());
+ reporter.setStatus(index.toString());
+ return new DDRecordReader(fs, split, job, index);
+ }
+ }
+
+ public static class HashPartitioner implements Partitioner {
+ public void configure(JobConf job) {}
+ public void close() {}
+ public int getPartition(WritableComparable key, Writable value,
+ int numReduceTasks) {
+ int hashCode = ((MD5Hash)key).hashCode();
+ return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
+ }
+ }
+
+ public static class UrlsReducer implements Reducer {
+
+ public void configure(JobConf job) {}
+
+ public void close() {}
+
+ public void reduce(WritableComparable key, Iterator values,
+ OutputCollector output, Reporter reporter) throws IOException {
+ IndexDoc latest = null;
+ while (values.hasNext()) {
+ IndexDoc value = (IndexDoc)values.next();
+ if (latest == null) {
+ latest = value;
+ continue;
+ }
+ if (value.time > latest.time) {
+ // discard current and use more recent
+ latest.keep = false;
+ LOG.debug("-discard " + latest + ", keep " + value);
+ output.collect(latest.hash, latest);
+ latest = value;
+ } else {
+ // discard
+ value.keep = false;
+ LOG.debug("-discard " + value + ", keep " + latest);
+ output.collect(value.hash, value);
+ }
+
+ }
+ // keep the latest
+ latest.keep = true;
+ output.collect(latest.hash, latest);
+
+ }
+ }
+
+ public static class HashReducer implements Reducer {
+ boolean byScore;
+
+ public void configure(JobConf job) {
+ byScore = job.getBoolean("dedup.keep.highest.score", true);
+ }
+
+ public void close() {}
+ public void reduce(WritableComparable key, Iterator values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ IndexDoc highest = null;
+ while (values.hasNext()) {
+ IndexDoc value = (IndexDoc)values.next();
+ // skip already deleted
+ if (!value.keep) {
+ LOG.debug("-discard " + value + " (already marked)");
+ output.collect(value.url, value);
+ continue;
+ }
+ if (highest == null) {
+ highest = value;
+ continue;
+ }
+ if (byScore) {
+ if (value.score > highest.score) {
+ highest.keep = false;
+ LOG.debug("-discard " + highest + ", keep " + value);
+ output.collect(highest.url, highest); // delete highest
+ highest = value;
+ }
+ } else {
+ if (value.urlLen < highest.urlLen) {
+ highest.keep = false;
+ LOG.debug("-discard " + highest + ", keep " + value);
+ output.collect(highest.url, highest); // delete highest
+ highest = value;
+ }
+ }
+ }
+ LOG.debug("-keep " + highest);
+ // no need to add this - in phase 2 we only process docs to delete them
+ // highest.keep = true;
+ // output.collect(key, highest);
+ }
+ }
+
+ private FileSystem fs;
+
+ public void configure(JobConf job) {
+ setConf(job);
+ }
+
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ try {
+ fs = FileSystem.get(conf);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public void close() {}
+
+ /** Map [*,IndexDoc] pairs to [index,doc] pairs. */
+ public void map(WritableComparable key, Writable value,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ IndexDoc indexDoc = (IndexDoc)value;
+ // don't delete these
+ if (indexDoc.keep) return;
+ // delete all others
+ output.collect(indexDoc.index, new IntWritable(indexDoc.doc));
+ }
+
+ /** Delete docs named in values from index named in key. */
+ public void reduce(WritableComparable key, Iterator values,
+ OutputCollector output, Reporter reporter)
+ throws IOException {
+ Path index = new Path(key.toString());
+ IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
+ try {
+ while (values.hasNext()) {
+ IntWritable value = (IntWritable)values.next();
+ LOG.debug("-delete " + index + " doc=" + value);
+ reader.deleteDocument(value.get());
+ }
+ } finally {
+ reader.close();
+ }
+ }
+
+ /** Write nothing. */
+ public RecordWriter getRecordWriter(final FileSystem fs,
+ final JobConf job,
+ final String name,
+ final Progressable progress) throws IOException {
+ return new RecordWriter() {
+ public void write(WritableComparable key, Writable value)
+ throws IOException {
+ throw new UnsupportedOperationException();
+ }
+ public void close(Reporter reporter) throws IOException {}
+ };
+ }
+
+ public DeleteDuplicates() {
+
+ }
+
+ public DeleteDuplicates(Configuration conf) {
+ setConf(conf);
+ }
+
+ public void checkOutputSpecs(FileSystem fs, JobConf job) {}
+
+ public void dedup(Path[] indexDirs)
+ throws IOException {
+
+ if (LOG.isInfoEnabled()) { LOG.info("Dedup: starting"); }
+
+ Path outDir1 =
+ new Path("dedup-urls-"+
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+
+ JobConf job = new NutchJob(getConf());
+
+ for (int i = 0; i < indexDirs.length; i++) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
+ }
+ job.addInputPath(indexDirs[i]);
+ }
+ job.setJobName("dedup 1: urls by time");
+
+ job.setInputFormat(InputFormat.class);
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(IndexDoc.class);
+
+ job.setReducerClass(UrlsReducer.class);
+ job.setOutputPath(outDir1);
+
+ job.setOutputKeyClass(MD5Hash.class);
+ job.setOutputValueClass(IndexDoc.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ JobClient.runJob(job);
+
+ Path outDir2 =
+ new Path("dedup-hash-"+
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ job = new NutchJob(getConf());
+ job.setJobName("dedup 2: content by hash");
+
+ job.addInputPath(outDir1);
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setMapOutputKeyClass(MD5Hash.class);
+ job.setMapOutputValueClass(IndexDoc.class);
+ job.setPartitionerClass(HashPartitioner.class);
+ job.setSpeculativeExecution(false);
+
+ job.setReducerClass(HashReducer.class);
+ job.setOutputPath(outDir2);
+
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(IndexDoc.class);
+ job.setOutputFormat(SequenceFileOutputFormat.class);
+
+ JobClient.runJob(job);
+
+ // remove outDir1 - no longer needed
+ fs.delete(outDir1);
+
+ job = new NutchJob(getConf());
+ job.setJobName("dedup 3: delete from index(es)");
+
+ job.addInputPath(outDir2);
+ job.setInputFormat(SequenceFileInputFormat.class);
+ //job.setInputKeyClass(Text.class);
+ //job.setInputValueClass(IndexDoc.class);
+
+ job.setInt("io.file.buffer.size", 4096);
+ job.setMapperClass(DeleteDuplicates.class);
+ job.setReducerClass(DeleteDuplicates.class);
+
+ job.setOutputFormat(DeleteDuplicates.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(IntWritable.class);
+
+ JobClient.runJob(job);
+
+ fs.delete(outDir2);
+
+ if (LOG.isInfoEnabled()) { LOG.info("Dedup: done"); }
+ }
+
+ public static void main(String[] args) throws Exception {
+ int res = new DeleteDuplicates().doMain(NutchConfiguration.create(), args);
+ System.exit(res);
+ }
+
+ public int run(String[] args) throws Exception {
+
+ if (args.length < 1) {
+ System.err.println("Usage: DeleteDuplicates <indexes> ...");
+ return -1;
+ }
+
+ Path[] indexes = new Path[args.length];
+ for (int i = 0; i < args.length; i++) {
+ indexes[i] = new Path(args[i]);
+ }
+ try {
+ dedup(indexes);
+ return 0;
+ } catch (Exception e) {
+ LOG.fatal("DeleteDuplicates: " + StringUtils.stringifyException(e));
+ return -1;
+ }
+ }
+
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
------------------------------------------------------------------------------
svn:eol-style = native
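Per the usage string and the ToolBase plumbing in the new class, the tool
takes one or more index directories on the command line
("DeleteDuplicates <indexes> ..."). Driven from code, using the constructor
and dedup(Path[]) defined above (the index path below is hypothetical):

    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.indexer.DeleteDuplicates;
    import org.apache.nutch.util.NutchConfiguration;

    public class DedupDriver {
      public static void main(String[] args) throws Exception {
        DeleteDuplicates dedup = new DeleteDuplicates(NutchConfiguration.create());
        // runs the three phases (urls by time, content by hash, delete)
        // as consecutive map-reduce jobs over the given indexes
        dedup.dedup(new Path[] { new Path("crawl/indexes/part-00000") });
      }
    }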
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Mon Oct 16 13:38:57 2006
@@ -24,11 +24,11 @@
import org.apache.hadoop.fs.*;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.hadoop.conf.*;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.ToolBase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.index.IndexWriter;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Mon Oct 16 13:38:57 2006
@@ -27,12 +27,12 @@
import org.apache.lucene.search.*;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.ToolBase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
/** Sort a Nutch index by page score. Higher scoring documents are assigned
* smaller document numbers. */
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Mon Oct 16 13:38:57 2006
@@ -29,6 +29,7 @@
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.parse.*;
import org.apache.nutch.analysis.*;
@@ -37,7 +38,6 @@
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.ToolBase;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -74,6 +74,11 @@
}
return super.next(key, (Writable)wrapper.get());
}
+
+ // override the default - we want ObjectWritable-s here
+ public Writable createValue() {
+ return new ObjectWritable();
+ }
};
}
}
@@ -233,7 +238,7 @@
Parse parse = new ParseImpl(parseText, parseData);
try {
// run indexing filters
- doc = this.filters.filter(doc, parse, (UTF8)key, fetchDatum, inlinks);
+ doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
} catch (IndexingException e) {
if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
return;
@@ -242,7 +247,7 @@
float boost = 1.0f;
// run scoring filters
try {
- boost = this.scfilters.indexerScore((UTF8)key, doc, dbDatum,
+ boost = this.scfilters.indexerScore((Text)key, doc, dbDatum,
fetchDatum, parse, inlinks, boost);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
@@ -283,15 +288,15 @@
job.addInputPath(new Path(linkDb, LinkDb.CURRENT_NAME));
job.setInputFormat(InputFormat.class);
- job.setInputKeyClass(UTF8.class);
- job.setInputValueClass(ObjectWritable.class);
+ //job.setInputKeyClass(Text.class);
+ //job.setInputValueClass(ObjectWritable.class);
//job.setCombinerClass(Indexer.class);
job.setReducerClass(Indexer.class);
job.setOutputPath(indexDir);
job.setOutputFormat(OutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ObjectWritable.class);
JobClient.runJob(job);
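
The createValue() override added in the hunk above exists because this job mixes crawldb, linkdb and segment inputs under one reduce: every record travels wrapped in an ObjectWritable, so the reusable value object handed out by the framework must be the wrapper rather than the file's declared value class. A self-contained sketch of the wrap/unwrap pattern (illustrative only):

    import org.apache.hadoop.io.ObjectWritable;
    import org.apache.nutch.crawl.CrawlDatum;

    public class WrapperDemo {
      public static void main(String[] args) {
        ObjectWritable wrapper = new ObjectWritable();
        wrapper.set(new CrawlDatum());       // wrap any Writable value
        Object inner = wrapper.get();        // unwrap on the reduce side
        if (inner instanceof CrawlDatum) {   // dispatch on the real type
          System.out.println("CrawlDatum: " + inner);
        }
      }
    }
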
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Mon Oct 16 13:38:57 2006
@@ -21,7 +21,7 @@
// Hadoop imports
import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
// Nutch imports
import org.apache.nutch.parse.Parse;
@@ -50,6 +50,6 @@
* @return modified (or a new) document instance
* @throws IndexingException
*/
- Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+ Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException;
}
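
Every indexing plugin has to adopt the Text-based signature above. A hypothetical filter sketch under the new interface (class and field names are illustrative, and the Lucene 2.0.0 Field API is assumed):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.crawl.Inlinks;
    import org.apache.nutch.indexer.IndexingFilter;
    import org.apache.nutch.parse.Parse;

    public class UrlFieldFilter implements IndexingFilter {
      private Configuration conf;

      public Document filter(Document doc, Parse parse, Text url,
                             CrawlDatum datum, Inlinks inlinks) {
        // Text.toString() replaces the old UTF8.toString() call sites
        doc.add(new Field("rawurl", url.toString(),
                          Field.Store.YES, Field.Index.UN_TOKENIZED));
        return doc;
      }

      public void setConf(Configuration conf) { this.conf = conf; }
      public Configuration getConf() { return conf; }
    }
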
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Mon Oct 16 13:38:57 2006
@@ -29,7 +29,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
/** Creates and caches {@link IndexingFilter} implementing plugins.*/
public class IndexingFilters {
@@ -66,7 +66,7 @@
}
/** Run all defined filters. */
- public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+ public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
for (int i = 0; i < this.indexingFilters.length; i++) {
doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Mon Oct 16 13:38:57 2006
@@ -34,7 +34,7 @@
import org.apache.commons.lang.StringUtils;
// Hadoop imports
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
@@ -305,11 +305,11 @@
String[] values = null;
String[] names = names();
for (int i=0; i<names.length; i++) {
- UTF8.writeString(out, names[i]);
+ Text.writeString(out, names[i]);
values = getValues(names[i]);
out.writeInt(values.length);
for (int j=0; j<values.length; j++) {
- UTF8.writeString(out, values[j]);
+ Text.writeString(out, values[j]);
}
}
}
@@ -319,10 +319,10 @@
int keySize = in.readInt();
String key;
for (int i=0; i<keySize; i++) {
- key = UTF8.readString(in);
+ key = Text.readString(in);
int valueSize = in.readInt();
for (int j=0; j<valueSize; j++) {
- add(key, UTF8.readString(in));
+ add(key, Text.readString(in));
}
}
}
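
Swapping UTF8.writeString() for Text.writeString() changes the wire format as well as the class: Text writes a variable-length (vint) length prefix and true UTF-8 bytes where UTF8 wrote a fixed two-byte length, which is why the Writable versions below have to be bumped. A test-style round-trip sketch of the new serialization:

    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;
    import org.apache.nutch.metadata.Metadata;

    public class MetadataRoundTrip {
      public static void main(String[] args) throws Exception {
        Metadata meta = new Metadata();
        meta.add("Content-Type", "text/html");
        meta.add("Content-Encoding", "gzip");

        DataOutputBuffer out = new DataOutputBuffer();
        meta.write(out);                     // Text.writeString under the hood

        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        Metadata copy = new Metadata();
        copy.readFields(in);                 // Text.readString under the hood

        System.out.println(copy.get("Content-Type"));  // text/html
      }
    }
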
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Mon Oct 16 13:38:57 2006
@@ -37,19 +37,19 @@
}
public void readFields(DataInput in) throws IOException {
- toUrl = UTF8.readString(in);
- anchor = UTF8.readString(in);
+ toUrl = Text.readString(in);
+ anchor = Text.readString(in);
}
/** Skips over one Outlink in the input. */
public static void skip(DataInput in) throws IOException {
- UTF8.skip(in); // skip toUrl
- UTF8.skip(in); // skip anchor
+ Text.skip(in); // skip toUrl
+ Text.skip(in); // skip anchor
}
public void write(DataOutput out) throws IOException {
- UTF8.writeString(out, toUrl);
- UTF8.writeString(out, anchor);
+ Text.writeString(out, toUrl);
+ Text.writeString(out, anchor);
}
public static Outlink read(DataInput in) throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Mon Oct 16 13:38:57 2006
@@ -34,7 +34,7 @@
public final class ParseData extends VersionedWritable implements Configurable {
public static final String DIR_NAME = "parse_data";
- private final static byte VERSION = 4;
+ private final static byte VERSION = 5;
private String title;
private Outlink[] outlinks;
@@ -42,6 +42,7 @@
private Metadata parseMeta;
private ParseStatus status;
private Configuration conf;
+ private byte version = VERSION;
// TODO mb@media-style.com: should we really implement Configurable or should we add the
// parameter Configuration to the default-constructor. NOTE: The test
@@ -110,16 +111,16 @@
// Writable methods
//
- public byte getVersion() { return VERSION; }
+ public byte getVersion() { return version; }
public final void readFields(DataInput in) throws IOException {
- byte version = in.readByte();
- if (version > 1)
- status = ParseStatus.read(in);
- else
- status = ParseStatus.STATUS_SUCCESS;
- title = UTF8.readString(in); // read title
+ version = in.readByte();
+ // incompatible change from UTF8 (version < 5) to Text
+ if (version != VERSION)
+ throw new VersionMismatchException(VERSION, version);
+ status = ParseStatus.read(in);
+ title = Text.readString(in); // read title
int totalOutlinks = in.readInt(); // read outlinks
int maxOutlinksPerPage = this.conf.getInt("db.max.outlinks.per.page", 100);
@@ -139,7 +140,7 @@
int propertyCount = in.readInt(); // read metadata
contentMeta = new Metadata();
for (int i = 0; i < propertyCount; i++) {
- contentMeta.add(UTF8.readString(in), UTF8.readString(in));
+ contentMeta.add(Text.readString(in), Text.readString(in));
}
} else {
contentMeta = new Metadata();
@@ -154,7 +155,7 @@
public final void write(DataOutput out) throws IOException {
out.writeByte(VERSION); // write version
status.write(out); // write status
- UTF8.writeString(out, title); // write title
+ Text.writeString(out, title); // write title
out.writeInt(outlinks.length); // write outlinks
for (int i = 0; i < outlinks.length; i++) {
@@ -189,6 +190,7 @@
public String toString() {
StringBuffer buffer = new StringBuffer();
+ buffer.append("Version: " + version + "\n" );
buffer.append("Status: " + status + "\n" );
buffer.append("Title: " + title + "\n" );
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Mon Oct 16 13:38:57 2006
@@ -66,13 +66,13 @@
new Path(new Path(job.getOutputPath(), CrawlDatum.PARSE_DIR_NAME), name);
final MapFile.Writer textOut =
- new MapFile.Writer(fs, text.toString(), UTF8.class, ParseText.class);
+ new MapFile.Writer(fs, text.toString(), Text.class, ParseText.class);
final MapFile.Writer dataOut =
- new MapFile.Writer(fs, data.toString(), UTF8.class,ParseData.class,true);
+ new MapFile.Writer(fs, data.toString(), Text.class,ParseData.class,true);
final SequenceFile.Writer crawlOut =
- new SequenceFile.Writer(fs, crawl, UTF8.class, CrawlDatum.class);
+ new SequenceFile.Writer(fs, crawl, Text.class, CrawlDatum.class);
return new RecordWriter() {
@@ -141,10 +141,10 @@
}
}
CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
- UTF8 targetUrl = new UTF8(toUrls[i]);
+ Text targetUrl = new Text(toUrls[i]);
adjust = null;
try {
- adjust = scfilters.distributeScoreToOutlink((UTF8)key, targetUrl,
+ adjust = scfilters.distributeScoreToOutlink((Text)key, targetUrl,
parseData, target, null, links.length, validCount);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Mon Oct 16 13:38:57 2006
@@ -54,10 +54,17 @@
}
public void close() {}
+
+ private Text newKey = new Text();
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter)
throws IOException {
+ // convert on the fly from old UTF8 keys
+ if (key instanceof UTF8) {
+ newKey.set(key.toString());
+ key = newKey;
+ }
Content content = (Content)value;
Parse parse = null;
@@ -75,7 +82,7 @@
if (status.isSuccess()) {
try {
- scfilters.passScoreAfterParsing((UTF8)key, content, parse);
+ scfilters.passScoreAfterParsing((Text)key, content, parse);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
e.printStackTrace(LogUtil.getWarnStream(LOG));
@@ -107,14 +114,14 @@
job.setInputPath(new Path(segment, Content.DIR_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
- job.setInputKeyClass(UTF8.class);
+ job.setInputKeyClass(Text.class);
job.setInputValueClass(Content.class);
job.setMapperClass(ParseSegment.class);
job.setReducerClass(ParseSegment.class);
job.setOutputPath(segment);
job.setOutputFormat(ParseOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ParseImpl.class);
JobClient.runJob(job);
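
The small shim in map() above is what keeps ParseSegment usable on segments fetched before the upgrade: content files written by older code still carry UTF8 keys, and the shim re-encodes them as Text on the fly. The same conversion, extracted as a reusable helper (hypothetical class; the commit inlines it):

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.UTF8;
    import org.apache.hadoop.io.WritableComparable;

    public class KeyShim {
      private final Text newKey = new Text();

      /** Returns a Text key, re-encoding legacy UTF8 keys on the fly. */
      public WritableComparable toText(WritableComparable key) {
        if (key instanceof UTF8) {
          newKey.set(key.toString());  // copy the bytes into a reused Text
          return newKey;
        }
        return key;
      }
    }
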
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Mon Oct 16 13:38:57 2006
@@ -24,7 +24,7 @@
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -75,7 +75,7 @@
Configuration conf = NutchConfiguration.create();
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
- Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
+ Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
if (force) {
content.setContentType(contentType);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Mon Oct 16 13:38:57 2006
@@ -32,7 +32,7 @@
public static final String DIR_NAME = "content";
- private final static byte VERSION = 1;
+ private final static byte VERSION = 2;
private byte version;
private String url;
@@ -64,31 +64,54 @@
protected final void readFieldsCompressed(DataInput in) throws IOException {
version = in.readByte();
- if (version > VERSION)
- throw new VersionMismatchException(VERSION, version);
-
- url = UTF8.readString(in); // read url
- base = UTF8.readString(in); // read base
-
- content = new byte[in.readInt()]; // read content
- in.readFully(content);
-
- contentType = UTF8.readString(in); // read contentType
-
metadata = new Metadata();
- metadata.readFields(in); // read meta data
+ switch (version) {
+ case 0:
+ case 1:
+ url = UTF8.readString(in); // read url
+ base = UTF8.readString(in); // read base
+
+ content = new byte[in.readInt()]; // read content
+ in.readFully(content);
+
+ contentType = UTF8.readString(in); // read contentType
+ // reconstruct metadata
+ int keySize = in.readInt();
+ String key;
+ for (int i = 0; i < keySize; i++) {
+ key = UTF8.readString(in);
+ int valueSize = in.readInt();
+ for (int j = 0; j < valueSize; j++) {
+ metadata.add(key, UTF8.readString(in));
+ }
+ }
+ break;
+ case VERSION:
+ url = Text.readString(in); // read url
+ base = Text.readString(in); // read base
+
+ content = new byte[in.readInt()]; // read content
+ in.readFully(content);
+
+ contentType = Text.readString(in); // read contentType
+ metadata.readFields(in); // read meta data
+ break;
+ default:
+ throw new VersionMismatchException(VERSION, version);
+ }
+
}
protected final void writeCompressed(DataOutput out) throws IOException {
- out.writeByte(version);
+ out.writeByte(VERSION);
- UTF8.writeString(out, url); // write url
- UTF8.writeString(out, base); // write base
+ Text.writeString(out, url); // write url
+ Text.writeString(out, base); // write base
out.writeInt(content.length); // write content
out.write(content);
- UTF8.writeString(out, contentType); // write contentType
+ Text.writeString(out, contentType); // write contentType
metadata.write(out); // write metadata
}
@@ -171,6 +194,7 @@
ensureInflated();
StringBuffer buffer = new StringBuffer();
+ buffer.append("Version: " + version + "\n" );
buffer.append("url: " + url + "\n" );
buffer.append("base: " + base + "\n" );
buffer.append("contentType: " + contentType + "\n" );
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Mon Oct 16 13:38:57 2006
@@ -18,7 +18,7 @@
// Hadoop imports
import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
@@ -32,5 +32,5 @@
/** Returns the {@link Content} for a fetchlist entry.
*/
- ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum);
+ ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java Mon Oct 16 13:38:57 2006
@@ -18,7 +18,7 @@
import java.util.List;
import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.lucene.document.Document;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -49,7 +49,7 @@
* @param datum new datum. Filters will modify it in-place.
* @throws ScoringFilterException
*/
- public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;
+ public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException;
/**
* Set an initial score for newly discovered pages. Note: newly discovered pages
@@ -60,7 +60,7 @@
* @param datum new datum. Filters will modify it in-place.
* @throws ScoringFilterException
*/
- public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException;
+ public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException;
/**
* This method prepares a sort value for the purpose of sorting and
@@ -69,7 +69,7 @@
* @param datum page's datum, should not be modified
* @param initSort initial sort value, or a value from previous filters in chain
*/
- public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) throws ScoringFilterException;
+ public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException;
/**
* This method takes all relevant score information from the current datum
@@ -82,7 +82,7 @@
* @param content instance of content. Implementations may modify this
* in-place, primarily by setting some metadata properties.
*/
- public void passScoreBeforeParsing(UTF8 url, CrawlDatum datum, Content content) throws ScoringFilterException;
+ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException;
/**
* Currently a part of score distribution is performed using only data coming
@@ -93,7 +93,7 @@
* @param parse target instance to copy the score information to. Implementations
* may modify this in-place, primarily by setting some metadata properties.
*/
- public void passScoreAfterParsing(UTF8 url, Content content, Parse parse) throws ScoringFilterException;
+ public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException;
/**
* Distribute score value from the current page to all its outlinked pages.
@@ -116,7 +116,7 @@
* be null if not needed.
* @throws ScoringFilterException
*/
- public CrawlDatum distributeScoreToOutlink(UTF8 fromUrl, UTF8 toUrl,
+ public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
ParseData parseData, CrawlDatum target, CrawlDatum adjust,
int allCount, int validCount) throws ScoringFilterException;
@@ -136,7 +136,7 @@
* links pointing to this page, found in the current update batch.
* @throws ScoringFilterException
*/
- public void updateDbScore(UTF8 url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException;
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException;
/**
* This method calculates a Lucene document boost.
@@ -156,6 +156,6 @@
* other scoring strategies by modifying Lucene document directly.
* @throws ScoringFilterException
*/
- public float indexerScore(UTF8 url, Document doc, CrawlDatum dbDatum,
+ public float indexerScore(Text url, Document doc, CrawlDatum dbDatum,
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException;
}
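
Scoring plugins need the same migration as the indexing filters above. A minimal pass-through implementation sketch under the new Text signatures (hypothetical class, shown only to illustrate the migrated interface):

    import java.util.List;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.lucene.document.Document;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.crawl.Inlinks;
    import org.apache.nutch.parse.Parse;
    import org.apache.nutch.parse.ParseData;
    import org.apache.nutch.protocol.Content;
    import org.apache.nutch.scoring.ScoringFilter;

    public class NoOpScoringFilter implements ScoringFilter {
      private Configuration conf;
      public void setConf(Configuration conf) { this.conf = conf; }
      public Configuration getConf() { return conf; }

      public void injectedScore(Text url, CrawlDatum datum) { }
      public void initialScore(Text url, CrawlDatum datum) { }
      public float generatorSortValue(Text url, CrawlDatum datum, float initSort) {
        return initSort;                 // leave the sort value unchanged
      }
      public void passScoreBeforeParsing(Text url, CrawlDatum datum,
                                         Content content) { }
      public void passScoreAfterParsing(Text url, Content content, Parse parse) { }
      public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
          ParseData parseData, CrawlDatum target, CrawlDatum adjust,
          int allCount, int validCount) {
        return adjust;                   // no per-outlink adjustment
      }
      public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
                                List inlinked) { }
      public float indexerScore(Text url, Document doc, CrawlDatum dbDatum,
          CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) {
        return initScore;                // keep the default boost
      }
    }
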