You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2008/12/02 15:59:21 UTC
svn commit: r722483 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
Author: kubes
Date: Tue Dec 2 06:59:21 2008
New Revision: 722483
URL: http://svn.apache.org/viewvc?rev=722483&view=rev
Log:
NUTCH-667: Input Format for working with Content in Hadoop Streaming
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
Modified:
lucene/nutch/trunk/CHANGES.txt
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=722483&r1=722482&r2=722483&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Dec 2 06:59:21 2008
@@ -291,6 +291,8 @@
107. NUTCH-647 - Resolve URLs tool (kubes)
108. NUTCH-665 - Search Load Testing Tool (kubes)
+
+109. NUTCH-667 - Input Format for working with Content in Hadoop Streaming
Release 0.9 - 2007-04-02
Added: lucene/nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java?rev=722483&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java Tue Dec 2 06:59:21 2008
@@ -0,0 +1,94 @@
+package org.apache.nutch.segment;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileRecordReader;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * An input format that takes Nutch Content objects and converts them to text
+ * while converting newline endings to spaces. This format is useful for working
+ * with Nutch content objects in Hadoop Streaming with other languages.
+ */
+public class ContentAsTextInputFormat
+  extends SequenceFileInputFormat<Text, Text> {
+
+  /**
+   * RecordReader that wraps a SequenceFileRecordReader over (Text, Content)
+   * pairs and exposes them as (Text, Text), flattening the raw content bytes
+   * into a single line of text.
+   */
+  private static class ContentAsTextRecordReader
+    implements RecordReader<Text, Text> {
+
+    // Underlying reader over the segment's (url, Content) sequence file.
+    private final SequenceFileRecordReader<Text, Content> sequenceFileRecordReader;
+
+    // Reusable holders for the raw records; refilled on every next() call.
+    private final Text innerKey;
+    private final Content innerValue;
+
+    public ContentAsTextRecordReader(Configuration conf, FileSplit split)
+      throws IOException {
+      sequenceFileRecordReader = new SequenceFileRecordReader<Text, Content>(
+        conf, split);
+      innerKey = (Text)sequenceFileRecordReader.createKey();
+      innerValue = (Content)sequenceFileRecordReader.createValue();
+    }
+
+    public Text createKey() {
+      return new Text();
+    }
+
+    public Text createValue() {
+      return new Text();
+    }
+
+    /**
+     * Reads the next raw (Text, Content) record and converts it into the
+     * caller-supplied (Text, Text) pair.
+     *
+     * @return true if a record was read, false at end of split
+     * @throws IOException if the underlying reader fails
+     */
+    public synchronized boolean next(Text key, Text value)
+      throws IOException {
+
+      if (!sequenceFileRecordReader.next(innerKey, innerValue)) {
+        return false;
+      }
+      key.set(innerKey.toString());
+
+      // NOTE(review): decodes with the platform default charset; the
+      // Content object's own encoding metadata is not consulted -- confirm
+      // whether UTF-8 (or the detected charset) should be forced here.
+      String contentAsStr = new String(innerValue.getContent());
+
+      // Collapse line endings to spaces so each record is a single line, as
+      // required by Hadoop Streaming's line-oriented protocol. char-based
+      // replace avoids compiling a regex per record, and '\r' is handled as
+      // well so CRLF content does not leak bare carriage returns.
+      contentAsStr = contentAsStr.replace('\r', ' ').replace('\n', ' ');
+      value.set(contentAsStr);
+
+      return true;
+    }
+
+    // synchronized for consistency with next()/getPos()/close(): all four
+    // touch the shared underlying reader.
+    public synchronized float getProgress()
+      throws IOException {
+      return sequenceFileRecordReader.getProgress();
+    }
+
+    public synchronized long getPos()
+      throws IOException {
+      return sequenceFileRecordReader.getPos();
+    }
+
+    public synchronized void close()
+      throws IOException {
+      sequenceFileRecordReader.close();
+    }
+  }
+
+  public ContentAsTextInputFormat() {
+    super();
+  }
+
+  /** Returns a reader that converts Content records to single-line text. */
+  public RecordReader<Text, Text> getRecordReader(InputSplit split,
+    JobConf job, Reporter reporter)
+    throws IOException {
+
+    reporter.setStatus(split.toString());
+    return new ContentAsTextRecordReader(job, (FileSplit)split);
+  }
+}