You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/10/14 21:33:12 UTC
svn commit: r1183467 - in /lucene/dev/trunk/lucene/src:
java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java
test/org/apache/lucene/index/TestForTooMuchCloning.java
test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java
Author: rmuir
Date: Fri Oct 14 19:33:11 2011
New Revision: 1183467
URL: http://svn.apache.org/viewvc?rev=1183467&view=rev
Log:
LUCENE-3517: fix pulsingcodec to reuse its enums
Added:
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java (with props)
Modified:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java?rev=1183467&r1=1183466&r2=1183467&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReader.java Fri Oct 14 19:33:11 2011
@@ -18,6 +18,8 @@ package org.apache.lucene.index.codecs.p
*/
import java.io.IOException;
+import java.util.IdentityHashMap;
+import java.util.Map;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
@@ -29,6 +31,9 @@ import org.apache.lucene.index.codecs.Bl
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
@@ -172,8 +177,6 @@ public class PulsingPostingsReader exten
}
}
- // TODO: we could actually reuse, by having TL that
- // holds the last wrapped reuse, and vice-versa
@Override
public DocsEnum docs(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsEnum reuse) throws IOException {
PulsingTermState termState = (PulsingTermState) _termState;
@@ -185,20 +188,29 @@ public class PulsingPostingsReader exten
postings = new PulsingDocsEnum(field);
}
} else {
- postings = new PulsingDocsEnum(field);
+ // the 'reuse' is actually the wrapped enum
+ PulsingDocsEnum previous = (PulsingDocsEnum) getOther(reuse);
+ if (previous != null && previous.canReuse(field)) {
+ postings = previous;
+ } else {
+ postings = new PulsingDocsEnum(field);
+ }
+ }
+ if (reuse != postings) {
+ setOther(postings, reuse); // postings.other = reuse
}
return postings.reset(liveDocs, termState);
} else {
- // TODO: not great that we lose reuse of PulsingDocsEnum in this case:
if (reuse instanceof PulsingDocsEnum) {
- return wrappedPostingsReader.docs(field, termState.wrappedTermState, liveDocs, null);
+ DocsEnum wrapped = wrappedPostingsReader.docs(field, termState.wrappedTermState, liveDocs, getOther(reuse));
+ setOther(wrapped, reuse); // wrapped.other = reuse
+ return wrapped;
} else {
return wrappedPostingsReader.docs(field, termState.wrappedTermState, liveDocs, reuse);
}
}
}
- // TODO: -- not great that we can't always reuse
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException {
if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
@@ -216,13 +228,23 @@ public class PulsingPostingsReader exten
postings = new PulsingDocsAndPositionsEnum(field);
}
} else {
- postings = new PulsingDocsAndPositionsEnum(field);
+ // the 'reuse' is actually the wrapped enum
+ PulsingDocsAndPositionsEnum previous = (PulsingDocsAndPositionsEnum) getOther(reuse);
+ if (previous != null && previous.canReuse(field)) {
+ postings = previous;
+ } else {
+ postings = new PulsingDocsAndPositionsEnum(field);
+ }
+ }
+ if (reuse != postings) {
+ setOther(postings, reuse); // postings.other = reuse
}
-
return postings.reset(liveDocs, termState);
} else {
if (reuse instanceof PulsingDocsAndPositionsEnum) {
- return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, null);
+ DocsAndPositionsEnum wrapped = wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, (DocsAndPositionsEnum) getOther(reuse));
+ setOther(wrapped, reuse); // wrapped.other = reuse
+ return wrapped;
} else {
return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, liveDocs, reuse);
}
@@ -499,4 +521,69 @@ public class PulsingPostingsReader exten
public void close() throws IOException {
wrappedPostingsReader.close();
}
+
+ /** for a docsenum, gets the 'other' reused enum.
+ * Example: Pulsing(Standard).
+ * when doing a term range query you are switching back and forth
+ * between Pulsing and Standard
+ *
+ * The way the reuse works is that Pulsing.other = Standard and
+ * Standard.other = Pulsing.
+ */
+ private DocsEnum getOther(DocsEnum de) {
+ if (de == null) {
+ return null;
+ } else {
+ final AttributeSource atts = de.attributes();
+ return atts.addAttribute(PulsingEnumAttribute.class).enums().get(this);
+ }
+ }
+
+ /**
+ * for a docsenum, sets the 'other' reused enum.
+ * see getOther for an example.
+ */
+ private DocsEnum setOther(DocsEnum de, DocsEnum other) {
+ final AttributeSource atts = de.attributes();
+ return atts.addAttribute(PulsingEnumAttribute.class).enums().put(this, other);
+ }
+
+ /**
+ * A per-docsenum attribute that stores additional reuse information
+ * so that pulsing enums can keep a reference to their wrapped enums,
+ * and vice versa. This way we can always reuse.
+ *
+ * @lucene.internal */
+ public static interface PulsingEnumAttribute extends Attribute {
+ public Map<PulsingPostingsReader,DocsEnum> enums();
+ }
+
+ /** @lucene.internal */
+ public static final class PulsingEnumAttributeImpl extends AttributeImpl implements PulsingEnumAttribute {
+ // we could store 'other', but what if someone 'chained' multiple postings readers,
+ // this could cause problems?
+ // TODO: we should consider nuking this map and just making it so if you do this,
+ // you don't reuse? and maybe pulsingPostingsReader should throw an exc if it wraps
+ // another pulsing, because this is just stupid and wasteful.
+ // we still have to be careful in case someone does Pulsing(Stomping(Pulsing(...
+ private final Map<PulsingPostingsReader,DocsEnum> enums =
+ new IdentityHashMap<PulsingPostingsReader,DocsEnum>();
+
+ public Map<PulsingPostingsReader,DocsEnum> enums() {
+ return enums;
+ }
+
+ @Override
+ public void clear() {
+ // our state is per-docsenum, so this makes no sense.
+ // it's best not to clear, in case a wrapped enum has a per-doc attribute or something
+ // and is calling clearAttributes(), so they don't nuke the reuse information!
+ }
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ // this makes no sense for us, because our state is per-docsenum.
+ // we don't want to copy any stuff over to another docsenum ever!
+ }
+ }
}
Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java?rev=1183467&r1=1183466&r2=1183467&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestForTooMuchCloning.java Fri Oct 14 19:33:11 2011
@@ -36,9 +36,6 @@ public class TestForTooMuchCloning exten
// Make sure we don't clone IndexInputs too frequently
// during merging:
public void test() throws Exception {
- String codec = CodecProvider.getDefault().getFieldCodec("field");
- // TODO: once LUCENE-3517 is fixed, remove this:
- assumeFalse("PulsingCodec fails this test because of over-cloning", codec.equals("Pulsing") || codec.equals("MockRandom"));
final MockDirectoryWrapper dir = newDirectory();
final TieredMergePolicy tmp = new TieredMergePolicy();
tmp.setMaxMergeAtOnce(2);
Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java?rev=1183467&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/pulsing/TestPulsingReuse.java Fri Oct 14 19:33:11 2011
@@ -0,0 +1,216 @@
+package org.apache.lucene.index.codecs.pulsing;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.IdentityHashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.CheckIndex;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.PerDocWriteState;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.codecs.BlockTreeTermsReader;
+import org.apache.lucene.index.codecs.BlockTreeTermsWriter;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.DefaultDocValuesConsumer;
+import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
+import org.apache.lucene.index.codecs.FieldsConsumer;
+import org.apache.lucene.index.codecs.FieldsProducer;
+import org.apache.lucene.index.codecs.PerDocConsumer;
+import org.apache.lucene.index.codecs.PerDocValues;
+import org.apache.lucene.index.codecs.PostingsReaderBase;
+import org.apache.lucene.index.codecs.PostingsWriterBase;
+import org.apache.lucene.index.codecs.standard.StandardCodec;
+import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
+import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests that pulsing codec reuses its enums and wrapped enums
+ */
+public class TestPulsingReuse extends LuceneTestCase {
+ // TODO: this is a basic test. this thing is complicated, add more
+ public void testSophisticatedReuse() throws Exception {
+ // we always run this test with pulsing codec.
+ CodecProvider cp = _TestUtil.alwaysCodec(new PulsingCodec(1));
+ Directory dir = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random, dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setCodecProvider(cp));
+ Document doc = new Document();
+ doc.add(new Field("foo", "a b b c c c d e f g g h i i j j k", TextField.TYPE_UNSTORED));
+ iw.addDocument(doc);
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexReader segment = ir.getSequentialSubReaders()[0];
+ DocsEnum reuse = null;
+ Map<DocsEnum,Boolean> allEnums = new IdentityHashMap<DocsEnum,Boolean>();
+ TermsEnum te = segment.terms("foo").iterator();
+ while (te.next() != null) {
+ reuse = te.docs(null, reuse);
+ allEnums.put(reuse, true);
+ }
+
+ assertEquals(2, allEnums.size());
+
+ allEnums.clear();
+ DocsAndPositionsEnum posReuse = null;
+ te = segment.terms("foo").iterator();
+ while (te.next() != null) {
+ posReuse = te.docsAndPositions(null, posReuse);
+ allEnums.put(posReuse, true);
+ }
+
+ assertEquals(2, allEnums.size());
+
+ ir.close();
+ dir.close();
+ }
+
+ /** tests reuse with Pulsing1(Pulsing2(Standard)) */
+ public void testNestedPulsing() throws Exception {
+ // we always run this test with pulsing codec.
+ CodecProvider cp = _TestUtil.alwaysCodec(new NestedPulsing());
+ MockDirectoryWrapper dir = newDirectory();
+ dir.setCheckIndexOnClose(false); // will do this ourselves, custom codec
+ RandomIndexWriter iw = new RandomIndexWriter(random, dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setCodecProvider(cp));
+ Document doc = new Document();
+ doc.add(new Field("foo", "a b b c c c d e f g g g h i i j j k l l m m m", TextField.TYPE_UNSTORED));
+ // note: the reuse is imperfect, here we would have 4 enums (lost reuse when we get an enum for 'm')
+ // this is because we only track the 'last' enum we reused (not all).
+ // but this seems 'good enough' for now.
+ iw.addDocument(doc);
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexReader segment = ir.getSequentialSubReaders()[0];
+ DocsEnum reuse = null;
+ Map<DocsEnum,Boolean> allEnums = new IdentityHashMap<DocsEnum,Boolean>();
+ TermsEnum te = segment.terms("foo").iterator();
+ while (te.next() != null) {
+ reuse = te.docs(null, reuse);
+ allEnums.put(reuse, true);
+ }
+
+ assertEquals(4, allEnums.size());
+
+ allEnums.clear();
+ DocsAndPositionsEnum posReuse = null;
+ te = segment.terms("foo").iterator();
+ while (te.next() != null) {
+ posReuse = te.docsAndPositions(null, posReuse);
+ allEnums.put(posReuse, true);
+ }
+
+ assertEquals(4, allEnums.size());
+
+ ir.close();
+ CheckIndex ci = new CheckIndex(dir);
+ ci.checkIndex(null, cp);
+ dir.close();
+ }
+
+ static class NestedPulsing extends Codec {
+ public NestedPulsing() {
+ super("NestedPulsing");
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ PostingsWriterBase docsWriter = new StandardPostingsWriter(state);
+
+ PostingsWriterBase pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
+ PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
+
+ // Terms dict
+ boolean success = false;
+ try {
+ FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter,
+ BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ pulsingWriter.close();
+ }
+ }
+ }
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+ PostingsReaderBase docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.context, state.codecId);
+ PostingsReaderBase pulsingReaderInner = new PulsingPostingsReader(docsReader);
+ PostingsReaderBase pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
+ boolean success = false;
+ try {
+ FieldsProducer ret = new BlockTreeTermsReader(
+ state.dir, state.fieldInfos, state.segmentInfo.name,
+ pulsingReader,
+ state.context,
+ state.codecId,
+ state.termsIndexDivisor);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ pulsingReader.close();
+ }
+ }
+ }
+
+ @Override
+ public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
+ return new DefaultDocValuesConsumer(state);
+ }
+
+ @Override
+ public PerDocValues docsProducer(SegmentReadState state) throws IOException {
+ return new DefaultDocValuesProducer(state);
+ }
+
+ @Override
+ public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException {
+ StandardPostingsReader.files(dir, segmentInfo, id, files);
+ BlockTreeTermsReader.files(dir, segmentInfo, id, files);
+ DefaultDocValuesConsumer.files(dir, segmentInfo, id, files);
+ }
+
+ @Override
+ public void getExtensions(Set<String> extensions) {
+ StandardCodec.getStandardExtensions(extensions);
+ DefaultDocValuesConsumer.getExtensions(extensions);
+ }
+ }
+}