You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/06 23:49:59 UTC
svn commit: r209524 - in /lucene/nutch/branches/mapred/src:
java/org/apache/nutch/mapred/ test/org/apache/nutch/mapred/
Author: cutting
Date: Wed Jul 6 14:49:58 2005
New Revision: 209524
URL: http://svn.apache.org/viewcvs?rev=209524&view=rev
Log:
Add test case for TextInputFormat, and fix it to pass.
Added:
lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java
lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=209524&r1=209523&r2=209524&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java Wed Jul 6 14:49:58 2005
@@ -237,7 +237,7 @@
new Thread(this.expireTrackers).start();
}
- private static InetSocketAddress getAddress(NutchConf conf) {
+ public static InetSocketAddress getAddress(NutchConf conf) {
String jobTrackerStr =
conf.get("mapred.job.tracker", "localhost:8012");
int colon = jobTrackerStr.indexOf(":");
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java?rev=209524&r1=209523&r2=209524&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java Wed Jul 6 14:49:58 2005
@@ -41,9 +41,9 @@
// open the file and seek to the start of the split
final NFSDataInputStream in =
new NFSDataInputStream(fs.open(split.getFile()));
- in.seek(start);
if (start != 0) {
+ in.seek(start-1);
while (in.getPos() < end) { // scan to the next newline in the file
char c = (char)in.read();
if (c == '\r' || c == '\n') {
Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java?rev=209524&r1=209523&r2=209524&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java (original)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java Wed Jul 6 14:49:58 2005
@@ -33,21 +33,28 @@
public void testFormat() throws Exception {
JobConf job = new JobConf(NutchConf.get());
NutchFileSystem fs = NutchFileSystem.getNamed("local");
- File dir = new File(System.getProperty("test.build.data",".") + "/mrtest");
+ File dir = new File(System.getProperty("test.build.data",".") + "/mapred");
File file = new File(dir, "test.seq");
int seed = new Random().nextInt();
- LOG.info("seed = "+seed);
+ //LOG.info("seed = "+seed);
Random random = new Random(seed);
dir.mkdirs();
+ File[] files = dir.listFiles();
+ if (files != null) {
+ for (int i = 0; i < files.length; i++) {
+ files[i].delete();
+ }
+ }
+
job.setInputDir(dir);
// for a variety of lengths
for (int length = 0; length < MAX_LENGTH;
length+= random.nextInt(MAX_LENGTH/10)+1) {
- LOG.info("creating; entries = " + length);
+ //LOG.info("creating; entries = " + length);
// create a file with length entries
file.delete();
@@ -73,9 +80,9 @@
for (int i = 0; i < 3; i++) {
int numSplits =
random.nextInt(MAX_LENGTH/(SequenceFile.SYNC_INTERVAL/20))+1;
- LOG.info("splitting: requesting = " + numSplits);
+ //LOG.info("splitting: requesting = " + numSplits);
FileSplit[] splits = format.getSplits(fs, job, numSplits);
- LOG.info("splitting: got = " + splits.length);
+ //LOG.info("splitting: got = " + splits.length);
// check each split
BitSet bits = new BitSet(length);
Added: lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java?rev=209524&view=auto
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java (added)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java Wed Jul 6 14:49:58 2005
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.mapred;
+
+import java.io.*;
+import java.util.*;
+import junit.framework.TestCase;
+import java.util.logging.*;
+
+import org.apache.nutch.fs.*;
+import org.apache.nutch.io.*;
+import org.apache.nutch.util.*;
+
+public class TestTextInputFormat extends TestCase { // checks that TextInputFormat's splits together yield every line of a file exactly once
+ private static final Logger LOG = InputFormatBase.LOG; // shares InputFormatBase's logger; only referenced by the commented-out diagnostics below
+
+ private static int MAX_LENGTH = 10000; // exclusive upper bound on lines per test file; also scales the step and split-count ranges
+
+ public void testFormat() throws Exception {
+ JobConf job = new JobConf(NutchConf.get());
+ NutchFileSystem fs = NutchFileSystem.getNamed("local");
+ File dir = new File(System.getProperty("test.build.data",".") + "/mapred");
+ File file = new File(dir, "test.txt");
+
+ int seed = new Random().nextInt(); // fresh random seed on every run (logging of it is disabled on the next line)
+ //LOG.info("seed = "+seed);
+ Random random = new Random(seed);
+
+ dir.mkdirs();
+ File[] files = dir.listFiles(); // remove leftovers from earlier runs; presumably so the input dir holds only test.txt
+ if (files != null) {
+ for (int i = 0; i < files.length; i++) {
+ files[i].delete();
+ }
+ }
+
+ job.setInputDir(dir);
+
+ // for a variety of lengths
+ for (int length = 0; length < MAX_LENGTH;
+ length+= random.nextInt(MAX_LENGTH/10)+1) { // random step of 1..MAX_LENGTH/10 lines
+
+ //LOG.info("creating; entries = " + length);
+
+ // create a file with length entries
+ file.delete();
+ Writer writer = new FileWriter(file);
+ try {
+ for (int i = 0; i < length; i++) {
+ writer.write(Integer.toString(i)); // line i holds the decimal text of i, so each value identifies its line
+ writer.write("\n");
+ }
+ } finally {
+ writer.close();
+ }
+
+ // try splitting the file in a variety of sizes
+ InputFormat format = new TextInputFormat();
+ LongWritable key = new LongWritable();
+ UTF8 value = new UTF8();
+ for (int i = 0; i < 3; i++) {
+ int numSplits = random.nextInt(MAX_LENGTH/20)+1; // request between 1 and MAX_LENGTH/20 splits
+ //LOG.info("splitting: requesting = " + numSplits);
+ FileSplit[] splits = format.getSplits(fs, job, numSplits);
+ //LOG.info("splitting: got = " + splits.length);
+
+ // check each split
+ BitSet bits = new BitSet(length); // bit v is set once the line with value v has been read from some split
+ for (int j = 0; j < splits.length; j++) {
+ RecordReader reader = format.getRecordReader(fs, splits[j], job);
+ int count = 0; // per-split record tally; only used by the commented-out log below
+ while (reader.next(key, value)) {
+ int v = Integer.parseInt(value.toString());
+// if (bits.get(v)) {
+// LOG.info("splits["+j+"]="+splits[j]+" : " + v);
+// LOG.info("@"+reader.getPos());
+// }
+ assertFalse("Key in multiple partitions.", bits.get(v)); // a line value seen before means two splits overlapped
+ bits.set(v);
+ count++;
+ }
+ //LOG.info("splits["+j+"]="+splits[j]+" count=" + count);
+ }
+ assertEquals("Some keys in no partition.", length, bits.cardinality()); // every written line must have been read by some split
+ }
+
+ }
+ }
+
+ public static void main(String[] args) throws Exception { // runs the test directly, without a JUnit runner
+ new TestTextInputFormat().testFormat();
+ }
+}