You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2008/08/19 01:56:21 UTC
svn commit: r686900 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexSorter.java src/test/org/apache/nutch/indexer/TestIndexSorter.java

Author: ab
Date: Mon Aug 18 16:56:20 2008
New Revision: 686900

URL: http://svn.apache.org/viewvc?rev=686900&view=rev
Log:
NUTCH-641 IndexSorter incorrectly copies stored fields.

Added:
    lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java   (with props)
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=686900&r1=686899&r2=686900&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Aug 18 16:56:20 2008
@@ -256,6 +256,9 @@
 93. NUTCH-634 - Upgrade Nutch to Hadoop 0.17.1 (Michael Gottesman, Lincoln
     Ritter, ab)
 
+94. NUTCH-641 - IndexSorter incorrectly copies stored fields (ab)
+
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=686900&r1=686899&r2=686900&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Mon Aug 18 16:56:20 2008
@@ -191,6 +191,11 @@
       return super.document(newToOld[n]);
     }
 
+    public Document document(int n, FieldSelector fieldSelector) // field-selector variant of document(int)
+        throws CorruptIndexException, IOException {
+      return super.document(newToOld[n], fieldSelector); // look up n in newToOld, then delegate to the superclass
+    }
+
     public boolean isDeleted(int n) {
       return false;
     }
@@ -240,6 +245,10 @@
         return this.score < that.score ? 1 : -1 ;
       }
     }
+    
+    public String toString() { // renders the oldDoc and score values as text
+      return "oldDoc=" + oldDoc + ",score=" + score;
+    }
   }
 
   public IndexSorter() {

Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java?rev=686900&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java Mon Aug 18 16:56:20 2008
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.File;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.nutch.analysis.NutchDocumentAnalyzer;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestIndexSorter extends TestCase {
+  private static final Log LOG = LogFactory.getLog(TestIndexSorter.class);
+  
+  private static final String INDEX_PLAIN = "index";
+  private static final String INDEX_SORTED = "index-sorted";
+  private static final int NUM_DOCS = 254;
+  private String[] fieldNames = new String[] {
+      "id",
+      "url",
+      "site",
+      "content",
+      "host",
+      "anchor",
+      "boost"
+  };
+  
+  Configuration conf = null;
+  File testDir = null;
+  Directory dir = null;
+  
+  
+  protected void setUp() throws Exception {
+    if (conf == null) conf = NutchConfiguration.create();
+    // create test index
+    testDir = new File("indexSorter-test-" + System.currentTimeMillis());
+    if (!testDir.mkdirs()) {
+      throw new Exception("Can't create test dir " + testDir.toString());
+    }
+    LOG.info("Creating test index: " + testDir.getAbsolutePath());
+    File plain = new File(testDir, INDEX_PLAIN);
+    Directory dir = FSDirectory.getDirectory(plain);
+    IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+    // create test documents
+    for (int i = 0; i < NUM_DOCS; i++) {
+      Document doc = new Document();
+      for (int k = 0; k < fieldNames.length; k++) {
+        Field f;
+        Store s;
+        Index ix;
+        String val = null;
+        if (fieldNames[k].equals("id")) {
+          s = Store.YES;
+          ix = Index.UN_TOKENIZED;
+          val = String.valueOf(i);
+        } else if (fieldNames[k].equals("host")) {
+          s = Store.YES;
+          ix = Index.UN_TOKENIZED;
+          val = "www.example" + i + ".com";
+        } else if (fieldNames[k].equals("site")) {
+          s = Store.NO;
+          ix = Index.UN_TOKENIZED;
+          val = "www.example" + i + ".com";
+        } else if (fieldNames[k].equals("content")) {
+          s = Store.NO;
+          ix = Index.TOKENIZED;
+          val = "This is the content of the " + i + "-th document.";
+        } else if (fieldNames[k].equals("boost")) {
+          s = Store.YES;
+          ix = Index.NO;
+          // XXX note that this way we ensure different values of encoded boost
+          // XXX note also that for this reason we can't reliably test more than
+          // XXX 255 documents.
+          float boost = Similarity.decodeNorm((byte)(i + 1));
+          val = String.valueOf(boost);
+          doc.setBoost(boost);
+        } else {
+          s = Store.YES;
+          ix = Index.TOKENIZED;
+          if (fieldNames[k].equals("anchor")) {
+            val = "anchors to " + i + "-th page.";
+          } else if (fieldNames[k].equals("url")) {
+            val = "http://www.example" + i + ".com/" + i + ".html";
+          }
+        }
+        f = new Field(fieldNames[k], val, s, ix);
+        doc.add(f);
+      }
+      writer.addDocument(doc);
+    }
+    writer.optimize();
+    writer.close();
+  }
+  
+  protected void tearDown() throws Exception {
+    FileUtil.fullyDelete(testDir);
+  }
+  
+  public void testSorting() throws Exception {
+    IndexSorter sorter = new IndexSorter(conf);
+    sorter.sort(testDir);
+    // read back documents
+    IndexReader reader = IndexReader.open(new File(testDir, INDEX_SORTED));
+    assertEquals(reader.numDocs(), NUM_DOCS);
+    for (int i = 0; i < reader.maxDoc(); i++) {
+      Document doc = reader.document(i);
+      Field f = doc.getField("content");
+      assertNull(f);
+      f = doc.getField("boost");
+      float boost = Similarity.decodeNorm((byte)(NUM_DOCS - i));
+      String cmp = String.valueOf(boost);
+      assertEquals(cmp, f.stringValue());
+    }
+    reader.close();
+  }
+
+}

Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexSorter.java
------------------------------------------------------------------------------
    svn:eol-style = native