You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/06 23:49:59 UTC

svn commit: r209524 - in /lucene/nutch/branches/mapred/src: java/org/apache/nutch/mapred/ test/org/apache/nutch/mapred/

Author: cutting
Date: Wed Jul  6 14:49:58 2005
New Revision: 209524

URL: http://svn.apache.org/viewcvs?rev=209524&view=rev
Log:
Add test case for TextInputFormat, and fix it to pass.

Added:
    lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java
Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java
    lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=209524&r1=209523&r2=209524&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java Wed Jul  6 14:49:58 2005
@@ -237,7 +237,7 @@
         new Thread(this.expireTrackers).start();
     }
 
-    private static InetSocketAddress getAddress(NutchConf conf) {
+    public static InetSocketAddress getAddress(NutchConf conf) {
       String jobTrackerStr =
         conf.get("mapred.job.tracker", "localhost:8012");
       int colon = jobTrackerStr.indexOf(":");

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java?rev=209524&r1=209523&r2=209524&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java Wed Jul  6 14:49:58 2005
@@ -41,9 +41,9 @@
     // open the file and seek to the start of the split
     final NFSDataInputStream in =
       new NFSDataInputStream(fs.open(split.getFile()));
-    in.seek(start);
     
     if (start != 0) {
+      in.seek(start-1);
       while (in.getPos() < end) {    // scan to the next newline in the file
         char c = (char)in.read();
         if (c == '\r' || c == '\n') {

Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java?rev=209524&r1=209523&r2=209524&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java (original)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java Wed Jul  6 14:49:58 2005
@@ -33,21 +33,28 @@
   public void testFormat() throws Exception {
     JobConf job = new JobConf(NutchConf.get());
     NutchFileSystem fs = NutchFileSystem.getNamed("local");
-    File dir = new File(System.getProperty("test.build.data",".") + "/mrtest");
+    File dir = new File(System.getProperty("test.build.data",".") + "/mapred");
     File file = new File(dir, "test.seq");
     
     int seed = new Random().nextInt();
-    LOG.info("seed = "+seed);
+    //LOG.info("seed = "+seed);
     Random random = new Random(seed);
 
     dir.mkdirs();
+    File[] files = dir.listFiles();
+    if (files != null) {
+      for (int i = 0; i < files.length; i++) {
+        files[i].delete();
+      }
+    }
+
     job.setInputDir(dir);
 
     // for a variety of lengths
     for (int length = 0; length < MAX_LENGTH;
          length+= random.nextInt(MAX_LENGTH/10)+1) {
 
-      LOG.info("creating; entries = " + length);
+      //LOG.info("creating; entries = " + length);
 
       // create a file with length entries
       file.delete();
@@ -73,9 +80,9 @@
       for (int i = 0; i < 3; i++) {
         int numSplits =
           random.nextInt(MAX_LENGTH/(SequenceFile.SYNC_INTERVAL/20))+1;
-        LOG.info("splitting: requesting = " + numSplits);
+        //LOG.info("splitting: requesting = " + numSplits);
         FileSplit[] splits = format.getSplits(fs, job, numSplits);
-        LOG.info("splitting: got =        " + splits.length);
+        //LOG.info("splitting: got =        " + splits.length);
 
         // check each split
         BitSet bits = new BitSet(length);

Added: lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java?rev=209524&view=auto
==============================================================================
--- lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java (added)
+++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/mapred/TestTextInputFormat.java Wed Jul  6 14:49:58 2005
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.mapred;
+
+import java.io.*;
+import java.util.*;
+import junit.framework.TestCase;
+import java.util.logging.*;
+
+import org.apache.nutch.fs.*;
+import org.apache.nutch.io.*;
+import org.apache.nutch.util.*;
+
+public class TestTextInputFormat extends TestCase { // checks that TextInputFormat splits cover every line exactly once
+  private static final Logger LOG = InputFormatBase.LOG; // only referenced by the commented-out LOG.info calls below
+
+  private static int MAX_LENGTH = 10000; // exclusive upper bound on the number of lines written to the test file
+
+  public void testFormat() throws Exception { // writes files of varying length, splits them, reads every split back
+    JobConf job = new JobConf(NutchConf.get());
+    NutchFileSystem fs = NutchFileSystem.getNamed("local");
+    File dir = new File(System.getProperty("test.build.data",".") + "/mapred");
+    File file = new File(dir, "test.txt");
+    
+    int seed = new Random().nextInt(); // fresh seed on every run; the line that would log it is commented out
+    //LOG.info("seed = "+seed);
+    Random random = new Random(seed);
+
+    dir.mkdirs();
+    File[] files = dir.listFiles(); // delete whatever is already in dir; presumably so test.txt is the only input (TODO confirm)
+    if (files != null) {
+      for (int i = 0; i < files.length; i++) {
+        files[i].delete();
+      }
+    }
+
+    job.setInputDir(dir);
+
+    // for a variety of lengths: start at 0 lines and grow by a random step of 1..MAX_LENGTH/10
+    for (int length = 0; length < MAX_LENGTH;
+         length+= random.nextInt(MAX_LENGTH/10)+1) {
+
+      //LOG.info("creating; entries = " + length);
+
+      // create a file with length entries: the integers 0..length-1, one per line
+      file.delete();
+      Writer writer = new FileWriter(file);
+      try {
+        for (int i = 0; i < length; i++) {
+          writer.write(Integer.toString(i));
+          writer.write("\n");
+        }
+      } finally {
+        writer.close();
+      }
+
+      // try splitting the file in a variety of sizes: three random split counts per file
+      InputFormat format = new TextInputFormat();
+      LongWritable key = new LongWritable();
+      UTF8 value = new UTF8();
+      for (int i = 0; i < 3; i++) {
+        int numSplits = random.nextInt(MAX_LENGTH/20)+1; // requested number of splits, 1..MAX_LENGTH/20
+        //LOG.info("splitting: requesting = " + numSplits);
+        FileSplit[] splits = format.getSplits(fs, job, numSplits);
+        //LOG.info("splitting: got =        " + splits.length);
+
+        // check each split: bit v is set once the line holding integer v has been read
+        BitSet bits = new BitSet(length);
+        for (int j = 0; j < splits.length; j++) {
+          RecordReader reader = format.getRecordReader(fs, splits[j], job);
+          int count = 0; // records read from this split; only used by the commented-out log below
+          while (reader.next(key, value)) {
+            int v = Integer.parseInt(value.toString());
+//             if (bits.get(v)) {
+//               LOG.info("splits["+j+"]="+splits[j]+" : " + v);
+//               LOG.info("@"+reader.getPos());
+//             }
+            assertFalse("Key in multiple partitions.", bits.get(v)); // a value seen twice means two reads returned the same line
+            bits.set(v);
+            count++;
+          }
+          //LOG.info("splits["+j+"]="+splits[j]+" count=" + count);
+        }
+        assertEquals("Some keys in no partition.", length, bits.cardinality()); // all length values must have been seen
+      }
+
+    }
+  }
+
+  public static void main(String[] args) throws Exception { // runs the test directly, without a JUnit runner
+    new TestTextInputFormat().testFormat();
+  }
+}