You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/06/27 09:05:53 UTC
svn commit: r551081 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/segment/ src/java/org/apache/nutch/util/
Author: dogacan
Date: Wed Jun 27 00:05:52 2007
New Revision: 551081
URL: http://svn.apache.org/viewvc?view=rev&rev=551081
Log:
NUTCH-474 - Replace usage of ObjectWritable with something based on GenericWritable.
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jun 27 00:05:52 2007
@@ -67,6 +67,9 @@
21. NUTCH-497 - Extreme Nested Tags causes StackOverflowException in
DomContentUtils...Spider Trap. (kubes)
+22. NUTCH-474 - Replace usage of ObjectWritable with something based on
+ GenericWritable. (dogacan)
+
Release 0.9 - 2007-04-02
1. Changed log4j configuration to log to stdout on commandline
Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?view=auto&rev=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Wed Jun 27 00:05:52 2007
@@ -0,0 +1,49 @@
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.util.GenericWritableConfigurable;
+
+public class NutchWritable extends GenericWritableConfigurable {
+
+ private static Class<? extends Writable>[] CLASSES = null;
+
+ static {
+ CLASSES = (Class<? extends Writable>[]) new Class[] {
+ org.apache.hadoop.io.NullWritable.class,
+ org.apache.hadoop.io.LongWritable.class,
+ org.apache.hadoop.io.BytesWritable.class,
+ org.apache.hadoop.io.FloatWritable.class,
+ org.apache.hadoop.io.IntWritable.class,
+ org.apache.hadoop.io.Text.class,
+ org.apache.hadoop.io.MD5Hash.class,
+ org.apache.nutch.crawl.CrawlDatum.class,
+ org.apache.nutch.crawl.Inlink.class,
+ org.apache.nutch.crawl.Inlinks.class,
+ org.apache.nutch.crawl.MapWritable.class,
+ org.apache.nutch.fetcher.FetcherOutput.class,
+ org.apache.nutch.metadata.Metadata.class,
+ org.apache.nutch.parse.Outlink.class,
+ org.apache.nutch.parse.ParseText.class,
+ org.apache.nutch.parse.ParseData.class,
+ org.apache.nutch.parse.ParseImpl.class,
+ org.apache.nutch.parse.ParseStatus.class,
+ org.apache.nutch.protocol.Content.class,
+ org.apache.nutch.protocol.ProtocolStatus.class,
+ org.apache.nutch.searcher.Hit.class,
+ org.apache.nutch.searcher.HitDetails.class,
+ org.apache.nutch.searcher.Hits.class
+ };
+ }
+
+ public NutchWritable() { }
+
+ public NutchWritable(Writable instance) {
+ set(instance);
+ }
+
+ @Override
+ protected Class<? extends Writable>[] getTypes() {
+ return CLASSES;
+ }
+
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Jun 27 00:05:52 2007
@@ -32,6 +32,7 @@
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
@@ -321,9 +322,9 @@
}
try {
- output.collect(key, new ObjectWritable(datum));
+ output.collect(key, new NutchWritable(datum));
if (content != null && storingContent)
- output.collect(key, new ObjectWritable(content));
+ output.collect(key, new NutchWritable(content));
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
@@ -357,7 +358,7 @@
LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
}
}
- output.collect(url, new ObjectWritable(
+ output.collect(url, new NutchWritable(
new ParseImpl(new ParseText(parse.getText()),
parse.getData(), parse.isCanonical())));
}
@@ -493,7 +494,7 @@
job.setOutputPath(segment);
job.setOutputFormat(FetcherOutputFormat.class);
job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(ObjectWritable.class);
+ job.setOutputValueClass(NutchWritable.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); }
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Wed Jun 27 00:05:52 2007
@@ -36,6 +36,7 @@
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
@@ -695,9 +696,9 @@
}
try {
- output.collect(key, new ObjectWritable(datum));
+ output.collect(key, new NutchWritable(datum));
if (content != null && storingContent)
- output.collect(key, new ObjectWritable(content));
+ output.collect(key, new NutchWritable(content));
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
@@ -731,7 +732,7 @@
LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
}
}
- output.collect(url, new ObjectWritable(
+ output.collect(url, new NutchWritable(
new ParseImpl(new ParseText(parse.getText()),
parse.getData(), parse.isCanonical())));
}
@@ -873,7 +874,7 @@
job.setOutputPath(segment);
job.setOutputFormat(FetcherOutputFormat.class);
job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(ObjectWritable.class);
+ job.setOutputValueClass(NutchWritable.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); }
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Wed Jun 27 00:05:52 2007
@@ -20,11 +20,11 @@
import java.io.IOException;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.Text;
@@ -81,7 +81,7 @@
public void write(WritableComparable key, Writable value)
throws IOException {
- Writable w = (Writable)((ObjectWritable)value).get();
+ Writable w = ((NutchWritable)value).get();
if (w instanceof CrawlDatum)
fetchOut.append(key, w);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed Jun 27 00:05:52 2007
@@ -43,6 +43,7 @@
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDb;
+import org.apache.nutch.crawl.NutchWritable;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
@@ -55,6 +56,32 @@
public static final String DONE_NAME = "index.done";
public static final Log LOG = LogFactory.getLog(Indexer.class);
+
+ /** A utility class used to pass a lucene document from Indexer.reduce
+ * to Indexer.OutputFormat.
+ * Note: Despite its name, it can't properly wrap a lucene document - it
+ * doesn't know how to serialize/deserialize a lucene document.
+ */
+ private static class LuceneDocumentWrapper implements Writable {
+ private Document doc;
+
+ public LuceneDocumentWrapper(Document doc) {
+ this.doc = doc;
+ }
+
+ public Document get() {
+ return doc;
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ // intentionally left blank
+ }
+
+ public void write(DataOutput out) throws IOException {
+ // intentionally left blank
+ }
+
+ }
/** Unwrap Lucene Documents created by reduce and add them to an index. */
public static class OutputFormat
@@ -87,7 +114,7 @@
public void write(WritableComparable key, Writable value)
throws IOException { // unwrap & index doc
- Document doc = (Document)((ObjectWritable)value).get();
+ Document doc = ((LuceneDocumentWrapper) value).get();
NutchAnalyzer analyzer = factory.get(doc.get("lang"));
if (LOG.isInfoEnabled()) {
LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
@@ -156,7 +183,7 @@
ParseData parseData = null;
ParseText parseText = null;
while (values.hasNext()) {
- Object value = ((ObjectWritable)values.next()).get(); // unwrap
+ Writable value = ((NutchWritable)values.next()).get(); // unwrap
if (value instanceof Inlinks) {
inlinks = (Inlinks)value;
} else if (value instanceof CrawlDatum) {
@@ -240,7 +267,7 @@
doc.add(new Field("boost", Float.toString(boost),
Field.Store.YES, Field.Index.NO));
- output.collect(key, new ObjectWritable(doc));
+ output.collect(key, new LuceneDocumentWrapper(doc));
}
public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments)
@@ -274,7 +301,7 @@
job.setOutputPath(indexDir);
job.setOutputFormat(OutputFormat.class);
job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(ObjectWritable.class);
+ job.setOutputValueClass(NutchWritable.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); }
@@ -309,7 +336,7 @@
public void map(WritableComparable key, Writable value,
OutputCollector output, Reporter reporter) throws IOException {
- output.collect(key, new ObjectWritable(value));
+ output.collect(key, new NutchWritable(value));
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java Wed Jun 27 00:05:52 2007
@@ -22,17 +22,18 @@
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.crawl.NutchWritable;
/**
- * This is a simple decorator that adds metadata to any Object-s that can be
- * serialized by <tt>ObjectWritable</tt>. This is useful when data needs to be
+ * This is a simple decorator that adds metadata to any Writable-s that can be
+ * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be
* temporarily enriched during processing, but this
* temporary metadata doesn't need to be permanently stored after the job is done.
*
* @author Andrzej Bialecki
*/
-public class MetaWrapper extends ObjectWritable {
+public class MetaWrapper extends NutchWritable {
private Metadata metadata;
public MetaWrapper() {
@@ -40,14 +41,14 @@
metadata = new Metadata();
}
- public MetaWrapper(Object object, Configuration conf) {
- super(object);
+ public MetaWrapper(Writable instance, Configuration conf) {
+ super(instance);
metadata = new Metadata();
setConf(conf);
}
- public MetaWrapper(Metadata metadata, Object object, Configuration conf) {
- super(object);
+ public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) {
+ super(instance);
if (metadata == null) metadata = new Metadata();
this.metadata = metadata;
setConf(conf);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Wed Jun 27 00:05:52 2007
@@ -153,7 +153,7 @@
MetaWrapper wrapper = (MetaWrapper) value;
try {
- wrapper.set(getValueClass().newInstance());
+ wrapper.set((Writable)getValueClass().newInstance());
} catch (Exception e) {
throw new IOException(e.toString());
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?view=diff&rev=551081&r1=551080&r2=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed Jun 27 00:05:52 2007
@@ -42,7 +42,6 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;
@@ -61,6 +60,7 @@
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
@@ -87,7 +87,7 @@
newKey.set(key.toString());
key = newKey;
}
- collector.collect(key, new ObjectWritable(value));
+ collector.collect(key, new NutchWritable(value));
}
}
@@ -104,8 +104,7 @@
final PrintStream printStream = new PrintStream(fs.create(segmentDumpFile));
return new RecordWriter() {
public synchronized void write(WritableComparable key, Writable value) throws IOException {
- ObjectWritable writable = (ObjectWritable) value;
- printStream.println((String) writable.get());
+ printStream.println(value);
}
public synchronized void close(Reporter reporter) throws IOException {
@@ -170,7 +169,7 @@
dump.append("\nRecno:: ").append(recNo++).append("\n");
dump.append("URL:: " + key.toString() + "\n");
while (values.hasNext()) {
- Object value = ((ObjectWritable) values.next()).get(); // unwrap
+ Writable value = ((NutchWritable) values.next()).get(); // unwrap
if (value instanceof CrawlDatum) {
dump.append("\nCrawlDatum::\n").append(((CrawlDatum) value).toString());
} else if (value instanceof Content) {
@@ -183,7 +182,7 @@
LOG.warn("Unrecognized type: " + value.getClass());
}
}
- output.collect(key, new ObjectWritable(dump.toString()));
+ output.collect(key, new Text(dump.toString()));
}
public void dump(Path segment, Path output) throws IOException {
@@ -212,7 +211,7 @@
job.setOutputPath(tempDir);
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(ObjectWritable.class);
+ job.setOutputValueClass(NutchWritable.class);
JobClient.runJob(job);
Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?view=auto&rev=551081
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Wed Jun 27 00:05:52 2007
@@ -0,0 +1,41 @@
package org.apache.nutch.util;

import java.io.DataInput;
import java.io.IOException;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.Writable;

/** A generic Writable wrapper that can inject Configuration to {@link Configurable}s */
public abstract class GenericWritableConfigurable extends GenericWritable
  implements Configurable {

  private Configuration conf;

  public Configuration getConf() {
    return conf;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /**
   * Reads the one-byte type tag, instantiates the corresponding class from
   * {@link #getTypes()}, injects this wrapper's Configuration if the new
   * instance is {@link Configurable}, then delegates deserialization to it.
   *
   * @throws IOException if the type tag does not map to a registered class
   *         or the class cannot be instantiated
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    byte type = in.readByte();
    Class<? extends Writable>[] types = getTypes();
    // Guard against corrupt or version-skewed data: an unchecked index
    // would surface as an ArrayIndexOutOfBoundsException instead.
    if (type < 0 || type >= types.length) {
      throw new IOException("Unknown embedded type tag: " + type);
    }
    Class<? extends Writable> clazz = types[type];
    Writable w;
    try {
      w = clazz.newInstance();
    } catch (Exception e) {
      // Chain the cause instead of printStackTrace(): callers get the
      // real failure reason in the exception they catch.
      throw (IOException) new IOException(
          "Cannot initialize the class: " + clazz).initCause(e);
    }
    // Deserialization goes through the no-arg constructor, so Configurable
    // instances need their Configuration injected before readFields runs.
    if (w instanceof Configurable)
      ((Configurable) w).setConf(conf);
    set(w);
    w.readFields(in);
  }

}
+
+}