You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/10/14 21:33:12 UTC

svn commit: r1183467 - in /lucene/dev/trunk/lucene/src: java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java test/org/apache/lucene/index/TestForTooMuchCloning.java test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java

Author: rmuir
Date: Fri Oct 14 19:33:11 2011
New Revision: 1183467

URL: http://svn.apache.org/viewvc?rev=1183467&view=rev
Log:
LUCENE-3517: fix pulsingcodec to reuse its enums

Added:
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java   (with props)
Modified:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java?rev=1183467&r1=1183466&r2=1183467&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java Fri Oct 14 19:33:11 2011
@@ -18,6 +18,8 @@ package org.apache.lucene.index.codecs.p
  */
 
 import java.io.IOException;
+import java.util.IdentityHashMap;
+import java.util.Map;
 
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.DocsEnum;
@@ -29,6 +31,9 @@ import org.apache.lucene.index.codecs.Bl
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
@@ -172,8 +177,6 @@ public class PulsingPostingsReader exten
     }
   }
 
-  // TODO: we could actually reuse, by having TL that
-  // holds the last wrapped reuse, and vice-versa
   @Override
   public DocsEnum docs(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsEnum reuse) throws IOException {
     PulsingTermState termState = (PulsingTermState) _termState;
@@ -185,20 +188,29 @@ public class PulsingPostingsReader exten
           postings = new PulsingDocsEnum(field);
         }
       } else {
-        postings = new PulsingDocsEnum(field);
+        // the 'reuse' is actually the wrapped enum
+        PulsingDocsEnum previous = (PulsingDocsEnum) getOther(reuse);
+        if (previous != null && previous.canReuse(field)) {
+          postings = previous;
+        } else {
+          postings = new PulsingDocsEnum(field);
+        }
+      }
+      if (reuse != postings) {
+        setOther(postings, reuse); // postings.other = reuse
       }
       return postings.reset(liveDocs, termState);
     } else {
-      // TODO: not great that we lose reuse of PulsingDocsEnum in this case:
       if (reuse instanceof PulsingDocsEnum) {
-        return wrappedPostingsReader.docs(field, termState.wrappedTermState, liveDocs, null);
+        DocsEnum wrapped = wrappedPostingsReader.docs(field, termState.wrappedTermState, liveDocs, getOther(reuse));
+        setOther(wrapped, reuse); // wrapped.other = reuse
+        return wrapped;
       } else {
         return wrappedPostingsReader.docs(field, termState.wrappedTermState, liveDocs, reuse);
       }
     }
   }
 
-  // TODO: -- not great that we can't always reuse
   @Override
   public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
     if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
@@ -216,13 +228,23 @@ public class PulsingPostingsReader exten
           postings = new PulsingDocsAndPositionsEnum(field);
         }
       } else {
-        postings = new PulsingDocsAndPositionsEnum(field);
+        // the 'reuse' is actually the wrapped enum
+        PulsingDocsAndPositionsEnum previous = (PulsingDocsAndPositionsEnum) getOther(reuse);
+        if (previous != null && previous.canReuse(field)) {
+          postings = previous;
+        } else {
+          postings = new PulsingDocsAndPositionsEnum(field);
+        }
+      }
+      if (reuse != postings) {
+        setOther(postings, reuse); // postings.other = reuse 
       }
-
       return postings.reset(liveDocs, termState);
     } else {
       if (reuse instanceof PulsingDocsAndPositionsEnum) {
-        return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, null);
+        DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse));
+        setOther(wrapped, reuse); // wrapped.other = reuse
+        return wrapped;
       } else {
         return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse);
       }
@@ -499,4 +521,69 @@ public class PulsingPostingsReader exten
   public void close() throws IOException {
     wrappedPostingsReader.close();
   }
+  
+  /** for a docsenum, gets the 'other' reused enum.
+   * Example: Pulsing(Standard).
+   * when doing a term range query you are switching back and forth
+   * between Pulsing and Standard
+   * 
+   * The way the reuse works is that Pulsing.other = Standard and
+   * Standard.other = Pulsing.
+   */
+  private DocsEnum getOther(DocsEnum de) {
+    if (de == null) {
+      return null;
+    } else {
+      final AttributeSource atts = de.attributes();
+      return atts.addAttribute(PulsingEnumAttribute.class).enums().get(this);
+    }
+  }
+  
+  /** 
+   * for a docsenum, sets the 'other' reused enum.
+   * see getOther for an example.
+   */
+  private DocsEnum setOther(DocsEnum de, DocsEnum other) {
+    final AttributeSource atts = de.attributes();
+    return atts.addAttribute(PulsingEnumAttribute.class).enums().put(this, other);
+  }
+
+  /** 
+   * A per-docsenum attribute that stores additional reuse information
+   * so that pulsing enums can keep a reference to their wrapped enums,
+   * and vice versa. this way we can always reuse.
+   * 
+   * @lucene.internal */
+  public static interface PulsingEnumAttribute extends Attribute {
+    public Map<PulsingPostingsReader,DocsEnum> enums();
+  }
+    
+  /** @lucene.internal */
+  public static final class PulsingEnumAttributeImpl extends AttributeImpl implements PulsingEnumAttribute {
+    // we could store 'other', but what if someone 'chained' multiple postings readers,
+    // this could cause problems?
+    // TODO: we should consider nuking this map and just making it so if you do this,
+    // you don't reuse? and maybe pulsingPostingsReader should throw an exc if it wraps
+    // another pulsing, because this is just stupid and wasteful. 
+    // we still have to be careful in case someone does Pulsing(Stomping(Pulsing(...
+    private final Map<PulsingPostingsReader,DocsEnum> enums = 
+      new IdentityHashMap<PulsingPostingsReader,DocsEnum>();
+      
+    public Map<PulsingPostingsReader,DocsEnum> enums() {
+      return enums;
+    }
+
+    @Override
+    public void clear() {
+      // our state is per-docsenum, so this makes no sense.
+      // its best not to clear, in case a wrapped enum has a per-doc attribute or something
+      // and is calling clearAttributes(), so they don't nuke the reuse information!
+    }
+
+    @Override
+    public void copyTo(AttributeImpl target) {
+      // this makes no sense for us, because our state is per-docsenum.
+      // we don't want to copy any stuff over to another docsenum ever!
+    }
+  }
 }

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java?rev=1183467&r1=1183466&r2=1183467&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java Fri Oct 14 19:33:11 2011
@@ -36,9 +36,6 @@ public class TestForTooMuchCloning exten
   // Make sure we don't clone IndexInputs too frequently
   // during merging:
   public void test() throws Exception {
-    String codec = CodecProvider.getDefault().getFieldCodec("field");
-    // TODO: once LUCENE-3517 is fixed, remove this:
-    assumeFalse("PulsingCodec fails this test because of over-cloning", codec.equals("Pulsing") || codec.equals("MockRandom"));
     final MockDirectoryWrapper dir = newDirectory();
     final TieredMergePolicy tmp = new TieredMergePolicy();
     tmp.setMaxMergeAtOnce(2);

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java?rev=1183467&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java Fri Oct 14 19:33:11 2011
@@ -0,0 +1,216 @@
+package org.apache.lucene.index.codecs.pulsing;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.IdentityHashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.CheckIndex;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.PerDocWriteState;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.codecs.BlockTreeTermsReader;
+import org.apache.lucene.index.codecs.BlockTreeTermsWriter;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.DefaultDocValuesConsumer;
+import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
+import org.apache.lucene.index.codecs.FieldsConsumer;
+import org.apache.lucene.index.codecs.FieldsProducer;
+import org.apache.lucene.index.codecs.PerDocConsumer;
+import org.apache.lucene.index.codecs.PerDocValues;
+import org.apache.lucene.index.codecs.PostingsReaderBase;
+import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.standard.StandardCodec;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests that pulsing codec reuses its enums and wrapped enums
+ */
+public class TestPulsingReuse extends LuceneTestCase {
+  // TODO: this is a basic test. this thing is complicated, add more
+  public void testSophisticatedReuse() throws Exception {
+    // we always run this test with pulsing codec.
+    CodecProvider cp = _TestUtil.alwaysCodec(new PulsingCodec(1));
+    Directory dir = newDirectory();
+    RandomIndexWriter iw = new RandomIndexWriter(random, dir, 
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setCodecProvider(cp));
+    Document doc = new Document();
+    doc.add(new Field("foo", "a b b c c c d e f g g h i i j j k", TextField.TYPE_UNSTORED));
+    iw.addDocument(doc);
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexReader segment = ir.getSequentialSubReaders()[0];
+    DocsEnum reuse = null;
+    Map<DocsEnum,Boolean> allEnums = new IdentityHashMap<DocsEnum,Boolean>();
+    TermsEnum te = segment.terms("foo").iterator();
+    while (te.next() != null) {
+      reuse = te.docs(null, reuse);
+      allEnums.put(reuse, true);
+    }
+    
+    assertEquals(2, allEnums.size());
+    
+    allEnums.clear();
+    DocsAndPositionsEnum posReuse = null;
+    te = segment.terms("foo").iterator();
+    while (te.next() != null) {
+      posReuse = te.docsAndPositions(null, posReuse);
+      allEnums.put(posReuse, true);
+    }
+    
+    assertEquals(2, allEnums.size());
+    
+    ir.close();
+    dir.close();
+  }
+  
+  /** tests reuse with Pulsing1(Pulsing2(Standard)) */
+  public void testNestedPulsing() throws Exception {
+    // we always run this test with pulsing codec.
+    CodecProvider cp = _TestUtil.alwaysCodec(new NestedPulsing());
+    MockDirectoryWrapper dir = newDirectory();
+    dir.setCheckIndexOnClose(false); // will do this ourselves, custom codec
+    RandomIndexWriter iw = new RandomIndexWriter(random, dir, 
+        newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setCodecProvider(cp));
+    Document doc = new Document();
+    doc.add(new Field("foo", "a b b c c c d e f g g g h i i j j k l l m m m", TextField.TYPE_UNSTORED));
+    // note: the reuse is imperfect, here we would have 4 enums (lost reuse when we get an enum for 'm')
+    // this is because we only track the 'last' enum we reused (not all).
+    // but this seems 'good enough' for now.
+    iw.addDocument(doc);
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexReader segment = ir.getSequentialSubReaders()[0];
+    DocsEnum reuse = null;
+    Map<DocsEnum,Boolean> allEnums = new IdentityHashMap<DocsEnum,Boolean>();
+    TermsEnum te = segment.terms("foo").iterator();
+    while (te.next() != null) {
+      reuse = te.docs(null, reuse);
+      allEnums.put(reuse, true);
+    }
+    
+    assertEquals(4, allEnums.size());
+    
+    allEnums.clear();
+    DocsAndPositionsEnum posReuse = null;
+    te = segment.terms("foo").iterator();
+    while (te.next() != null) {
+      posReuse = te.docsAndPositions(null, posReuse);
+      allEnums.put(posReuse, true);
+    }
+    
+    assertEquals(4, allEnums.size());
+    
+    ir.close();
+    CheckIndex ci = new CheckIndex(dir);
+    ci.checkIndex(null, cp);
+    dir.close();
+  }
+  
+  static class NestedPulsing extends Codec {
+    public NestedPulsing() {
+      super("NestedPulsing");
+    }
+    
+    @Override
+    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+      PostingsWriterBase docsWriter = new StandardPostingsWriter(state);
+
+      PostingsWriterBase pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
+      PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
+      
+      // Terms dict
+      boolean success = false;
+      try {
+        FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, 
+            BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
+        success = true;
+        return ret;
+      } finally {
+        if (!success) {
+          pulsingWriter.close();
+        }
+      }
+    }
+
+    @Override
+    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+      PostingsReaderBase docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.context, state.codecId);
+      PostingsReaderBase pulsingReaderInner = new PulsingPostingsReader(docsReader);
+      PostingsReaderBase pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
+      boolean success = false;
+      try {
+        FieldsProducer ret = new BlockTreeTermsReader(
+                                                      state.dir, state.fieldInfos, state.segmentInfo.name,
+                                                      pulsingReader,
+                                                      state.context,
+                                                      state.codecId,
+                                                      state.termsIndexDivisor);
+        success = true;
+        return ret;
+      } finally {
+        if (!success) {
+          pulsingReader.close();
+        }
+      }
+    }
+
+    @Override
+    public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
+      return new DefaultDocValuesConsumer(state);
+    }
+
+    @Override
+    public PerDocValues docsProducer(SegmentReadState state) throws IOException {
+      return new DefaultDocValuesProducer(state);
+    }
+
+    @Override
+    public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException {
+      StandardPostingsReader.files(dir, segmentInfo, id, files);
+      BlockTreeTermsReader.files(dir, segmentInfo, id, files);
+      DefaultDocValuesConsumer.files(dir, segmentInfo, id, files);
+    }
+
+    @Override
+    public void getExtensions(Set<String> extensions) {
+      StandardCodec.getStandardExtensions(extensions);
+      DefaultDocValuesConsumer.getExtensions(extensions);
+    }
+  }
+}