You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mh...@apache.org on 2012/08/02 15:00:04 UTC
svn commit: r1368442 - in /lucene/dev/branches/branch_4x/lucene: ./
core/src/java/org/apache/lucene/codecs/
core/src/java/org/apache/lucene/codecs/bloom/
core/src/java/org/apache/lucene/util/
core/src/java/org/apache/lucene/util/hash/ core/src/resource...
Author: mharwood
Date: Thu Aug 2 13:00:03 2012
New Revision: 1368442
URL: http://svn.apache.org/viewvc?rev=1368442&view=rev
Log:
New addition: Lucene-4069 BloomFilterPostingsFormat for faster access to low-frequency terms such as primary keys.
Added:
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java
lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java
Modified:
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1368442&r1=1368441&r2=1368442&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Aug 2 13:00:03 2012
@@ -13,7 +13,10 @@ New features
* LUCENE-4249: Changed the explanation of the PayloadTermWeight to use the
underlying PayloadFunction's explanation as the explanation
for the payload score. (Scott Smerchek via Robert Muir)
-
+
+* LUCENE-4069: Added BloomFilteringPostingsFormat for use with low-frequency terms
+ such as primary keys (Mark Harwood, Mike McCandless)
+
* LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese
iteration marks. (Robert Muir, Christian Moen)
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java?rev=1368442&r1=1368441&r2=1368442&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java Thu Aug 2 13:00:03 2012
@@ -53,7 +53,13 @@ public abstract class PostingsFormat imp
/** Reads a segment. NOTE: by the time this call
* returns, it must hold open any files it will need to
- * use; else, those files may be deleted. */
+ * use; else, those files may be deleted.
+ * Additionally, required files may be deleted during the execution of
+ * this call before there is a chance to open them. Under these
+ * circumstances an IOException should be thrown by the implementation.
+ * IOExceptions are expected and will automatically cause a retry of the
+ * segment opening logic with the newly revised segments.
+ * */
public abstract FieldsProducer fieldsProducer(SegmentReadState state) throws IOException;
@Override
Added: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java Thu Aug 2 13:00:03 2012
@@ -0,0 +1,63 @@
+package org.apache.lucene.codecs.bloom;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.FuzzySet;
+
+
+/**
+ * Class used to create index-time {@link FuzzySet} appropriately configured for
+ * each field. Also called to right-size bitsets for serialization.
+ * @lucene.experimental
+ */
+public abstract class BloomFilterFactory {
+
+ /**
+ *
+ * @param state The content to be indexed
+ * @param info
+ * the field requiring a BloomFilter
+ * @return An appropriately sized set or null if no BloomFiltering required
+ */
+ public abstract FuzzySet getSetForField(SegmentWriteState state, FieldInfo info);
+
+ /**
+ * Called when downsizing bitsets for serialization
+ *
+ * @param fieldInfo
+ * The field with sparse set bits
+ * @param initialSet
+ * The bits accumulated
+ * @return null or a hopefully more densely packed, smaller bitset
+ */
+ public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) {
+ // Aim for a bitset size that would have 10% of bits set (so 90% of searches
+ // would fail-fast)
+ float targetMaxSaturation = 0.1f;
+ return initialSet.downsize(targetMaxSaturation);
+ }
+
+ /**
+ * Used to determine if the given filter has reached saturation and should be retired i.e. not saved any more
+ * @param bloomFilter
+ * @param fieldInfo
+ * @return
+ */
+ public abstract boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo);
+
+}
Added: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java Thu Aug 2 13:00:03 2012
@@ -0,0 +1,514 @@
+package org.apache.lucene.codecs.bloom;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.TermsConsumer;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FuzzySet;
+import org.apache.lucene.util.FuzzySet.ContainsResult;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.hash.MurmurHash2;
+
+/**
+ * <p>
+ * A {@link PostingsFormat} useful for low doc-frequency fields such as primary
+ * keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
+ * for reads in segments known to have no record of the key. A choice of
+ * delegate PostingsFormat is used to record all other Postings data.
+ * </p>
+ * <p>
+ * A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter
+ * settings on a per-field basis. The default configuration is
+ * {@link DefaultBloomFilterFactory} which allocates a ~8mb bitset and hashes
+ * values using {@link MurmurHash2}. This should be suitable for most purposes.
+ * </p>
+ * <p>
+ * The format of the blm file is as follows:
+ * </p>
+ * <ul>
+ * <li>BloomFilter (.blm) --> Header, DelegatePostingsFormatName,
+ * NumFilteredFields, Filter<sup>NumFilteredFields</sup></li>
+ * <li>Filter --> FieldNumber, FuzzySet</li>
+ * <li>FuzzySet -->See {@link FuzzySet#serialize(DataOutput)}</li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>DelegatePostingsFormatName --> {@link DataOutput#writeString(String)
+ * String} The name of a ServiceProvider registered {@link PostingsFormat}</li>
+ * <li>NumFilteredFields --> {@link DataOutput#writeInt Uint32}</li>
+ * <li>FieldNumber --> {@link DataOutput#writeInt Uint32} The number of the
+ * field in this segment</li>
+ * </ul>
+ * @lucene.experimental
+ */
+public class BloomFilteringPostingsFormat extends PostingsFormat {
+
+ public static final String BLOOM_CODEC_NAME = "BloomFilter";
+ public static final int BLOOM_CODEC_VERSION = 1;
+
+ /** Extension of Bloom Filters file */
+ static final String BLOOM_EXTENSION = "blm";
+
+ BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
+ private PostingsFormat delegatePostingsFormat;
+
+ /**
+ * Creates Bloom filters for a selection of fields created in the index. This
+ * is recorded as a set of Bitsets held as a segment summary in an additional
+ * "blm" file. This PostingsFormat delegates to a choice of delegate
+ * PostingsFormat for encoding all other postings data.
+ *
+ * @param delegatePostingsFormat
+ * The PostingsFormat that records all the non-bloom filter data i.e.
+ * postings info.
+ * @param bloomFilterFactory
+ * The {@link BloomFilterFactory} responsible for sizing BloomFilters
+ * appropriately
+ */
+ public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat,
+ BloomFilterFactory bloomFilterFactory) {
+ super(BLOOM_CODEC_NAME);
+ this.delegatePostingsFormat = delegatePostingsFormat;
+ this.bloomFilterFactory = bloomFilterFactory;
+ }
+
+ /**
+ * Creates Bloom filters for a selection of fields created in the index. This
+ * is recorded as a set of Bitsets held as a segment summary in an additional
+ * "blm" file. This PostingsFormat delegates to a choice of delegate
+ * PostingsFormat for encoding all other postings data. This choice of
+ * constructor defaults to the {@link DefaultBloomFilterFactory} for
+ * configuring per-field BloomFilters.
+ *
+ * @param delegatePostingsFormat
+ * The PostingsFormat that records all the non-bloom filter data i.e.
+ * postings info.
+ */
+ public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) {
+ this(delegatePostingsFormat, new DefaultBloomFilterFactory());
+ }
+
+ // Used only by core Lucene at read-time via Service Provider instantiation -
+ // do not use at Write-time in application code.
+ public BloomFilteringPostingsFormat() {
+ super(BLOOM_CODEC_NAME);
+ }
+
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state)
+ throws IOException {
+ if (delegatePostingsFormat == null) {
+ throw new UnsupportedOperationException("Error - " + getClass().getName()
+ + " has been constructed without a choice of PostingsFormat");
+ }
+ return new BloomFilteredFieldsConsumer(
+ delegatePostingsFormat.fieldsConsumer(state), state,
+ delegatePostingsFormat);
+ }
+
+ public FieldsProducer fieldsProducer(SegmentReadState state)
+ throws IOException {
+ return new BloomFilteredFieldsProducer(state);
+ }
+
+ public class BloomFilteredFieldsProducer extends FieldsProducer {
+ private FieldsProducer delegateFieldsProducer;
+ HashMap<String,FuzzySet> bloomsByFieldName = new HashMap<String,FuzzySet>();
+
+ public BloomFilteredFieldsProducer(SegmentReadState state)
+ throws IOException {
+
+ String bloomFileName = IndexFileNames.segmentFileName(
+ state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
+ IndexInput bloomIn = null;
+ try {
+ bloomIn = state.dir.openInput(bloomFileName, state.context);
+ CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
+ BLOOM_CODEC_VERSION);
+ // // Load the hash function used in the BloomFilter
+ // hashFunction = HashFunction.forName(bloomIn.readString());
+ // Load the delegate postings format
+ PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
+ .readString());
+
+ this.delegateFieldsProducer = delegatePostingsFormat
+ .fieldsProducer(state);
+ int numBlooms = bloomIn.readInt();
+ for (int i = 0; i < numBlooms; i++) {
+ int fieldNum = bloomIn.readInt();
+ FuzzySet bloom = FuzzySet.deserialize(bloomIn);
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
+ bloomsByFieldName.put(fieldInfo.name, bloom);
+ }
+ } finally {
+ IOUtils.close(bloomIn);
+ }
+
+ }
+
+ public FieldsEnum iterator() throws IOException {
+ return new BloomFilteredFieldsEnum(delegateFieldsProducer.iterator(),
+ bloomsByFieldName);
+ }
+
+ public void close() throws IOException {
+ delegateFieldsProducer.close();
+ }
+
+ public Terms terms(String field) throws IOException {
+ FuzzySet filter = bloomsByFieldName.get(field);
+ if (filter == null) {
+ return delegateFieldsProducer.terms(field);
+ } else {
+ Terms result = delegateFieldsProducer.terms(field);
+ if (result == null) {
+ return null;
+ }
+ return new BloomFilteredTerms(result, filter);
+ }
+ }
+
+ public int size() throws IOException {
+ return delegateFieldsProducer.size();
+ }
+
+ public long getUniqueTermCount() throws IOException {
+ return delegateFieldsProducer.getUniqueTermCount();
+ }
+
+ // Not all fields in a segment may be subject to a bloom filter. This class
+ // wraps Terms objects appropriately if a filtering request is present
+ class BloomFilteredFieldsEnum extends FieldsEnum {
+ private FieldsEnum delegateFieldsEnum;
+ private HashMap<String,FuzzySet> bloomsByFieldName;
+ private String currentFieldName;
+
+ public BloomFilteredFieldsEnum(FieldsEnum iterator,
+ HashMap<String,FuzzySet> bloomsByFieldName) {
+ this.delegateFieldsEnum = iterator;
+ this.bloomsByFieldName = bloomsByFieldName;
+ }
+
+ public AttributeSource attributes() {
+ return delegateFieldsEnum.attributes();
+ }
+
+ public String next() throws IOException {
+ currentFieldName = delegateFieldsEnum.next();
+ return currentFieldName;
+ }
+
+ public Terms terms() throws IOException {
+ FuzzySet filter = bloomsByFieldName.get(currentFieldName);
+ if (filter == null) {
+ return delegateFieldsEnum.terms();
+ } else {
+ Terms result = delegateFieldsEnum.terms();
+ if (result == null) {
+ return null;
+ }
+ // wrap the terms object with a bloom filter
+ return new BloomFilteredTerms(result, filter);
+ }
+ }
+
+ }
+
+ class BloomFilteredTerms extends Terms {
+ private Terms delegateTerms;
+ private FuzzySet filter;
+
+ public BloomFilteredTerms(Terms terms, FuzzySet filter) {
+ this.delegateTerms = terms;
+ this.filter = filter;
+ }
+
+ @Override
+ public TermsEnum intersect(CompiledAutomaton compiled,
+ final BytesRef startTerm) throws IOException {
+ return delegateTerms.intersect(compiled, startTerm);
+ }
+
+ @Override
+ public TermsEnum iterator(TermsEnum reuse) throws IOException {
+ TermsEnum result;
+ if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) {
+ // recycle the existing BloomFilteredTermsEnum by asking the delegate
+ // to recycle its contained TermsEnum
+ BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
+ if (bfte.filter == filter) {
+ bfte.delegateTermsEnum = delegateTerms
+ .iterator(bfte.delegateTermsEnum);
+ return bfte;
+ }
+ }
+ // We have been handed something we cannot reuse (either null, wrong
+ // class or wrong filter) so allocate a new object
+ result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse),
+ filter);
+ return result;
+ }
+
+ @Override
+ public Comparator<BytesRef> getComparator() throws IOException {
+ return delegateTerms.getComparator();
+ }
+
+ @Override
+ public long size() throws IOException {
+ return delegateTerms.size();
+ }
+
+ @Override
+ public long getSumTotalTermFreq() throws IOException {
+ return delegateTerms.getSumTotalTermFreq();
+ }
+
+ @Override
+ public long getSumDocFreq() throws IOException {
+ return delegateTerms.getSumDocFreq();
+ }
+
+ @Override
+ public int getDocCount() throws IOException {
+ return delegateTerms.getDocCount();
+ }
+ }
+
+ class BloomFilteredTermsEnum extends TermsEnum {
+
+ TermsEnum delegateTermsEnum;
+ private FuzzySet filter;
+
+ public BloomFilteredTermsEnum(TermsEnum iterator, FuzzySet filter) {
+ this.delegateTermsEnum = iterator;
+ this.filter = filter;
+ }
+
+ @Override
+ public final BytesRef next() throws IOException {
+ return delegateTermsEnum.next();
+ }
+
+ @Override
+ public final Comparator<BytesRef> getComparator() {
+ return delegateTermsEnum.getComparator();
+ }
+
+ @Override
+ public final boolean seekExact(BytesRef text, boolean useCache)
+ throws IOException {
+ // The magical fail-fast speed up that is the entire point of all of
+ // this code - save a disk seek if there is a match on an in-memory
+ // structure
+ // that may occasionally give a false positive but guaranteed no false
+ // negatives
+ if (filter.contains(text) == ContainsResult.NO) {
+ return false;
+ }
+ return delegateTermsEnum.seekExact(text, useCache);
+ }
+
+ @Override
+ public final SeekStatus seekCeil(BytesRef text, boolean useCache)
+ throws IOException {
+ return delegateTermsEnum.seekCeil(text, useCache);
+ }
+
+ @Override
+ public final void seekExact(long ord) throws IOException {
+ delegateTermsEnum.seekExact(ord);
+ }
+
+ @Override
+ public final BytesRef term() throws IOException {
+ return delegateTermsEnum.term();
+ }
+
+ @Override
+ public final long ord() throws IOException {
+ return delegateTermsEnum.ord();
+ }
+
+ @Override
+ public final int docFreq() throws IOException {
+ return delegateTermsEnum.docFreq();
+ }
+
+ @Override
+ public final long totalTermFreq() throws IOException {
+ return delegateTermsEnum.totalTermFreq();
+ }
+
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
+ DocsAndPositionsEnum reuse, int flags) throws IOException {
+ return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags);
+ }
+
+ @Override
+ public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
+ throws IOException {
+ return delegateTermsEnum.docs(liveDocs, reuse, flags);
+ }
+
+
+ }
+
+ }
+
+ class BloomFilteredFieldsConsumer extends FieldsConsumer {
+ private FieldsConsumer delegateFieldsConsumer;
+ private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<FieldInfo,FuzzySet>();
+ private SegmentWriteState state;
+
+ // private PostingsFormat delegatePostingsFormat;
+
+ public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
+ SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
+ this.delegateFieldsConsumer = fieldsConsumer;
+ // this.delegatePostingsFormat=delegatePostingsFormat;
+ this.state = state;
+ }
+
+ @Override
+ public TermsConsumer addField(FieldInfo field) throws IOException {
+ FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field);
+ if (bloomFilter != null) {
+ assert bloomFilters.containsKey(field) == false;
+ bloomFilters.put(field, bloomFilter);
+ return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field),bloomFilter);
+ } else {
+ // No, use the unfiltered fieldsConsumer - we are not interested in
+ // recording any term Bitsets.
+ return delegateFieldsConsumer.addField(field);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ delegateFieldsConsumer.close();
+ // Now we are done accumulating values for these fields
+ List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<Map.Entry<FieldInfo,FuzzySet>>();
+
+ for (Entry<FieldInfo,FuzzySet> entry : bloomFilters.entrySet()) {
+ FuzzySet bloomFilter = entry.getValue();
+ if(!bloomFilterFactory.isSaturated(bloomFilter,entry.getKey())){
+ nonSaturatedBlooms.add(entry);
+ }
+ }
+ String bloomFileName = IndexFileNames.segmentFileName(
+ state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
+ IndexOutput bloomOutput = null;
+ try {
+ bloomOutput = state.directory
+ .createOutput(bloomFileName, state.context);
+ CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
+ BLOOM_CODEC_VERSION);
+ // remember the name of the postings format we will delegate to
+ bloomOutput.writeString(delegatePostingsFormat.getName());
+
+ // First field in the output file is the number of fields+blooms saved
+ bloomOutput.writeInt(nonSaturatedBlooms.size());
+ for (Entry<FieldInfo,FuzzySet> entry : nonSaturatedBlooms) {
+ FieldInfo fieldInfo = entry.getKey();
+ FuzzySet bloomFilter = entry.getValue();
+ bloomOutput.writeInt(fieldInfo.number);
+ saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
+ }
+ } finally {
+ IOUtils.close(bloomOutput);
+ }
+ //We are done with large bitsets so no need to keep them hanging around
+ bloomFilters.clear();
+ }
+
+ private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
+ FuzzySet bloomFilter, FieldInfo fieldInfo) throws IOException {
+
+ FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo,
+ bloomFilter);
+ if (rightSizedSet == null) {
+ rightSizedSet = bloomFilter;
+ }
+ rightSizedSet.serialize(bloomOutput);
+ }
+
+ }
+
+ class WrappedTermsConsumer extends TermsConsumer {
+ private TermsConsumer delegateTermsConsumer;
+ private FuzzySet bloomFilter;
+
+ public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) {
+ this.delegateTermsConsumer = termsConsumer;
+ this.bloomFilter = bloomFilter;
+ }
+
+ public PostingsConsumer startTerm(BytesRef text) throws IOException {
+ return delegateTermsConsumer.startTerm(text);
+ }
+
+ public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+
+ // Record this term in our BloomFilter
+ if (stats.docFreq > 0) {
+ bloomFilter.addValue(text);
+ }
+ delegateTermsConsumer.finishTerm(text, stats);
+ }
+
+ public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
+ throws IOException {
+ delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
+ }
+
+ public Comparator<BytesRef> getComparator() throws IOException {
+ return delegateTermsConsumer.getComparator();
+ }
+
+ }
+
+}
Added: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java Thu Aug 2 13:00:03 2012
@@ -0,0 +1,44 @@
+package org.apache.lucene.codecs.bloom;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.FuzzySet;
+import org.apache.lucene.util.hash.HashFunction;
+import org.apache.lucene.util.hash.MurmurHash2;
+
+/**
+ * Default policy is to allocate a bitset with 10% saturation given a unique term per document.
+ * Bits are set via MurmurHash2 hashing function.
+ * @lucene.experimental
+ */
+public class DefaultBloomFilterFactory extends BloomFilterFactory {
+
+ @Override
+ public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) {
+ //Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with 10% of bits set
+ return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), 0.10f, new MurmurHash2());
+ }
+
+ @Override
+ public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
+ // Don't bother saving bitsets if >90% of bits are set - we don't want to
+ // throw any more memory at this problem.
+ return bloomFilter.getSaturation() > 0.9f;
+ }
+
+}
Added: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html Thu Aug 2 13:00:03 2012
@@ -0,0 +1,25 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Codec PostingsFormat for fast access to low-frequency terms such as primary key fields.
+</body>
+</html>
\ No newline at end of file
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java?rev=1368442&r1=1368441&r2=1368442&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java Thu Aug 2 13:00:03 2012
@@ -54,6 +54,12 @@ public final class FixedBitSet extends D
bits = new long[bits2words(numBits)];
}
+ public FixedBitSet(long[]storedBits,int numBits) {
+ this.numBits = numBits;
+ this.bits = storedBits;
+ }
+
+
/** Makes full copy. */
public FixedBitSet(FixedBitSet other) {
bits = new long[other.bits.length];
Added: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java Thu Aug 2 13:00:03 2012
@@ -0,0 +1,292 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.hash.HashFunction;
+
+/**
+ * <p>
+ * A class used to represent a set of many, potentially large, values (e.g. many
+ * long strings such as URLs), using a significantly smaller amount of memory.
+ * </p>
+ * <p>
+ * The set is "lossy" in that it cannot definitively state that is does contain
+ * a value but it <em>can</em> definitively say if a value is <em>not</em> in
+ * the set. It can therefore be used as a Bloom Filter.
+ * </p>
+ * Another application of the set is that it can be used to perform fuzzy counting because
+ * it can estimate reasonably accurately how many unique values are contained in the set.
+ * </p>
+ * <p>This class is NOT threadsafe.</p>
+ * <p>
+ * Internally a Bitset is used to record values and once a client has finished recording
+ * a stream of values the {@link #downsize(float)} method can be used to create a suitably smaller set that
+ * is sized appropriately for the number of values recorded and desired saturation levels.
+ *
+ * </p>
+ * @lucene.experimental
+ */
+public class FuzzySet {
+
+ public static final int FUZZY_SERIALIZATION_VERSION=1;
+
+ public enum ContainsResult {
+ MAYBE, NO
+ };
+ private HashFunction hashFunction;
+ private FixedBitSet filter;
+ private int bloomSize;
+
+ //The sizes of BitSet used are all numbers that, when expressed in binary form,
+ //are all ones. This is to enable fast downsizing from one bitset to another
+ //by simply ANDing each set index in one bitset with the size of the target bitset
+ // - this provides a fast modulo of the number. Values previously accumulated in
+ // a large bitset and then mapped to a smaller set can be looked up using a single
+ // AND operation of the query term's hash rather than needing to perform a 2-step
+ // translation of the query term that mirrors the stored content's reprojections.
+ static final int usableBitSetSizes[];
+ static
+ {
+ usableBitSetSizes=new int[30];
+ int mask=1;
+ int size=mask;
+ for (int i = 0; i < usableBitSetSizes.length; i++) {
+ size=(size<<1)|mask;
+ usableBitSetSizes[i]=size;
+ }
+ }
+
+ /**
+ * Rounds down required maxNumberOfBits to the nearest number that is made up
+ * of all ones as a binary number.
+ * Use this method where controlling memory use is paramount.
+ */
+ public static int getNearestSetSize(int maxNumberOfBits)
+ {
+ int result=usableBitSetSizes[0];
+ for (int i = 0; i < usableBitSetSizes.length; i++) {
+ if(usableBitSetSizes[i]<=maxNumberOfBits)
+ {
+ result=usableBitSetSizes[i];
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Use this method to choose a set size where accuracy (low content saturation) is more important
+ * than deciding how much memory to throw at the problem.
+ * @param maxNumberOfValuesExpected
+ * @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values have been recorded
+ * @return
+ */
+ public static int getNearestSetSize(int maxNumberOfValuesExpected,
+ float desiredSaturation) {
+ // Iterate around the various scales of bitset from smallest to largest looking for the first that
+ // satisfies value volumes at the chosen saturation level
+ for (int i = 0; i < usableBitSetSizes.length; i++) {
+ int numSetBitsAtDesiredSaturation = (int) (usableBitSetSizes[i] * desiredSaturation);
+ int estimatedNumUniqueValues = getEstimatedNumberUniqueValuesAllowingForCollisions(
+ usableBitSetSizes[i], numSetBitsAtDesiredSaturation);
+ if (estimatedNumUniqueValues > maxNumberOfValuesExpected) {
+ return usableBitSetSizes[i];
+ }
+ }
+ return -1;
+ }
+
+ public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes, HashFunction hashFunction)
+ {
+ int setSize=getNearestSetSize(maxNumBytes);
+ return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction);
+ }
+
+ public static FuzzySet createSetBasedOnQuality(int maxNumUniqueValues, float desiredMaxSaturation, HashFunction hashFunction)
+ {
+ int setSize=getNearestSetSize(maxNumUniqueValues,desiredMaxSaturation);
+ return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction);
+ }
+
+
+
+
+ private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) {
+ super();
+ this.filter = filter;
+ this.bloomSize = bloomSize;
+ this.hashFunction=hashFunction;
+ }
+
+ /**
+ * The main method required for a Bloom filter which, given a value determines set membership.
+ * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false.
+ * @param value
+ * @return NO or MAYBE
+ */
+ public ContainsResult contains(BytesRef value) {
+ int hash = hashFunction.hash(value);
+ if (hash < 0) {
+ hash = hash * -1;
+ }
+ return mayContainValue(hash);
+ }
+
+ /**
+ * Serializes the data set to file using the following format:
+ * <ul>
+ * <li>FuzzySet -->FuzzySetVersion,HashFunctionName,BloomSize,
+ * NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup></li>
+ * <li>HashFunctionName --> {@link DataOutput#writeString(String) String} The
+ * name of a ServiceProvider registered {@link HashFunction}</li>
+ * <li>FuzzySetVersion --> {@link DataOutput#writeInt Uint32} The version number of the {@link FuzzySet} class</li>
+ * <li>BloomSize --> {@link DataOutput#writeInt Uint32} The modulo value used
+ * to project hashes into the field's Bitset</li>
+ * <li>NumBitSetWords --> {@link DataOutput#writeInt Uint32} The number of
+ * longs (as returned from {@link FixedBitSet#getBits})</li>
+ * <li>BitSetWord --> {@link DataOutput#writeLong Long} A long from the array
+ * returned by {@link FixedBitSet#getBits}</li>
+ * </ul>
+ * @param out Data output stream
+ * @throws IOException
+ */
+ public void serialize(DataOutput out) throws IOException
+ {
+ out.writeInt(FUZZY_SERIALIZATION_VERSION);
+ out.writeString(hashFunction.getName());
+ out.writeInt(bloomSize);
+ long[] bits = filter.getBits();
+ out.writeInt(bits.length);
+ for (int i = 0; i < bits.length; i++) {
+ // Can't used VLong encoding because cant cope with negative numbers
+ // output by FixedBitSet
+ out.writeLong(bits[i]);
+ }
+ }
+ public static FuzzySet deserialize(DataInput in) throws IOException
+ {
+ int version=in.readInt();
+ if(version!=FUZZY_SERIALIZATION_VERSION)
+ {
+ throw new IOException("Error deserializing: set version is not "+FUZZY_SERIALIZATION_VERSION);
+ }
+ HashFunction hashFunction=HashFunction.forName(in.readString());
+ int bloomSize=in.readInt();
+ int numLongs=in.readInt();
+ long[]longs=new long[numLongs];
+ for (int i = 0; i < numLongs; i++) {
+ longs[i]=in.readLong();
+ }
+ FixedBitSet bits = new FixedBitSet(longs,bloomSize+1);
+ return new FuzzySet(bits,bloomSize,hashFunction);
+ }
+
+ private ContainsResult mayContainValue(int positiveHash) {
+ assert positiveHash >= 0;
+ // Bloom sizes are always base 2 and so can be ANDed for a fast modulo
+ int pos = positiveHash & bloomSize;
+ if (filter.get(pos)) {
+ // This term may be recorded in this index (but could be a collision)
+ return ContainsResult.MAYBE;
+ }
+ // definitely NOT in this segment
+ return ContainsResult.NO;
+ }
+
+ /**
+ * Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the
+ * chosen size of the internal bitset.
+ * @param bytes
+ * @throws IOException
+ */
+ public void addValue(BytesRef value) throws IOException {
+ int hash = hashFunction.hash(value);
+ if (hash < 0) {
+ hash = hash * -1;
+ }
+ // Bitmasking using bloomSize is effectively a modulo operation.
+ int bloomPos = hash & bloomSize;
+ filter.set(bloomPos);
+ }
+
+
+ /**
+ *
+ * @param targetSaturation A number between 0 and 1 describing the % of bits that would ideally be set in the
+ * result. Lower values have better qccuracy but require more space.
+ * @return a smaller FuzzySet or null if the current set is already over-saturated
+ */
+ public FuzzySet downsize(float targetMaxSaturation)
+ {
+ int numBitsSet = filter.cardinality();
+ FixedBitSet rightSizedBitSet = filter;
+ int rightSizedBitSetSize = bloomSize;
+ //Hopefully find a smaller size bitset into which we can project accumulated values while maintaining desired saturation level
+ for (int i = 0; i < usableBitSetSizes.length; i++) {
+ int candidateBitsetSize = usableBitSetSizes[i];
+ float candidateSaturation = (float) numBitsSet
+ / (float) candidateBitsetSize;
+ if (candidateSaturation <= targetMaxSaturation) {
+ rightSizedBitSetSize = candidateBitsetSize;
+ break;
+ }
+ }
+ // Re-project the numbers to a smaller space if necessary
+ if (rightSizedBitSetSize < bloomSize) {
+ // Reset the choice of bitset to the smaller version
+ rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1);
+ // Map across the bits from the large set to the smaller one
+ int bitIndex = 0;
+ do {
+ bitIndex = filter.nextSetBit(bitIndex);
+ if (bitIndex >= 0) {
+ // Project the larger number into a smaller one effectively
+ // modulo-ing by using the target bitset size as a mask
+ int downSizedBitIndex = bitIndex & rightSizedBitSetSize;
+ rightSizedBitSet.set(downSizedBitIndex);
+ bitIndex++;
+ }
+ } while ( (bitIndex >= 0)&&(bitIndex<=bloomSize));
+ } else {
+ return null;
+ }
+ return new FuzzySet(rightSizedBitSet,rightSizedBitSetSize, hashFunction);
+ }
+
+ public int getEstimatedUniqueValues()
+ {
+ return getEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.cardinality());
+ }
+
+ // Given a set size and a the number of set bits, produces an estimate of the number of unique values recorded
+ public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
+ int setSize, int numRecordedBits) {
+ double setSizeAsDouble = setSize;
+ double numRecordedBitsAsDouble = numRecordedBits;
+ double saturation = numRecordedBitsAsDouble / setSizeAsDouble;
+ double logInverseSaturation = Math.log(1 - saturation) * -1;
+ return (int) (setSizeAsDouble * logInverseSaturation);
+ }
+
+ public float getSaturation() {
+ int numBitsSet = filter.cardinality();
+ return (float) numBitsSet / (float) bloomSize;
+ }
+}
\ No newline at end of file
Added: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java Thu Aug 2 13:00:03 2012
@@ -0,0 +1,70 @@
+package org.apache.lucene.util.hash;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.Set;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.NamedSPILoader;
+
+
+/**
+ * Base class for hashing functions that can be referred to by name.
+ * Subclasses are expected to provide threadsafe implementations of the hash function
+ * on the range of bytes referenced in the provided {@link BytesRef}
+ * @lucene.experimental
+ */
+public abstract class HashFunction implements NamedSPILoader.NamedSPI {
+
+ /**
+ * Hashes the contents of the referenced bytes
+ * @param bytes the data to be hashed
+ * @return the hash of the bytes referenced by bytes.offset and length bytes.length
+ */
+ public abstract int hash(BytesRef bytes);
+
+ private static final NamedSPILoader<HashFunction> loader =
+ new NamedSPILoader<HashFunction>(HashFunction.class);
+
+ private final String name;
+
+ public HashFunction(String name) {
+ NamedSPILoader.checkServiceName(name);
+ this.name = name;
+ }
+
+ /** Returns this codec's name */
+ @Override
+ public final String getName() {
+ return name;
+ }
+
+ /** looks up a hash function by name */
+ public static HashFunction forName(String name) {
+ return loader.lookup(name);
+ }
+
+ /** returns a list of all available hash function names */
+ public static Set<String> availableHashFunctionNames() {
+ return loader.availableServices();
+ }
+
+
+ @Override
+ public String toString() {
+ return name;
+ }
+}
Added: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java Thu Aug 2 13:00:03 2012
@@ -0,0 +1,105 @@
+package org.apache.lucene.util.hash;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This is a very fast, non-cryptographic hash suitable for general hash-based
+ * lookup. See http://murmurhash.googlepages.com/ for more details.
+ * <p>
+ * The C version of MurmurHash 2.0 found at that site was ported to Java by
+ * Andrzej Bialecki (ab at getopt org).
+ * </p>
+ * <p>
+ * The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable choice of
+ * hashing functions as the core function had to be adapted to work with BytesRefs with offsets and lengths
+ * rather than raw byte arrays.
+ * </p>
+ * @lucene.experimental
+ */
+public class MurmurHash2 extends HashFunction{
+
+
+ public static final String HASH_NAME="MurmurHash2";
+
+ public MurmurHash2() {
+ super(HASH_NAME);
+ }
+
+ public static int hash(byte[] data, int seed, int offset, int len) {
+ int m = 0x5bd1e995;
+ int r = 24;
+ int h = seed ^ len;
+ int len_4 = len >> 2;
+ for (int i = 0; i < len_4; i++) {
+ int i_4 = offset + (i << 2);
+ int k = data[i_4 + 3];
+ k = k << 8;
+ k = k | (data[i_4 + 2] & 0xff);
+ k = k << 8;
+ k = k | (data[i_4 + 1] & 0xff);
+ k = k << 8;
+ k = k | (data[i_4 + 0] & 0xff);
+ k *= m;
+ k ^= k >>> r;
+ k *= m;
+ h *= m;
+ h ^= k;
+ }
+ int len_m = len_4 << 2;
+ int left = len - len_m;
+ if (left != 0) {
+ if (left >= 3) {
+ h ^= data[offset + len - 3] << 16;
+ }
+ if (left >= 2) {
+ h ^= data[offset + len - 2] << 8;
+ }
+ if (left >= 1) {
+ h ^= data[offset + len - 1];
+ }
+ h *= m;
+ }
+ h ^= h >>> 13;
+ h *= m;
+ h ^= h >>> 15;
+ return h;
+ }
+
+ /**
+ * Generates 32 bit hash from byte array with default seed value.
+ *
+ * @param data
+ * byte array to hash
+ * @param offset
+ * the start position in the array to hash
+ * @param len
+ * length of the array elements to hash
+ * @return 32 bit hash of the given array
+ */
+ public static final int hash32(final byte[] data, int offset, int len) {
+ return MurmurHash2.hash(data, 0x9747b28c, offset, len);
+ }
+
+
+ @Override
+ public final int hash(BytesRef br) {
+ return hash32(br.bytes, br.offset, br.length);
+ }
+
+}
Modified: lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1368442&r1=1368441&r2=1368442&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat Thu Aug 2 13:00:03 2012
@@ -17,4 +17,5 @@ org.apache.lucene.codecs.lucene40.Lucene
org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat
org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat
+org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat
Added: lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction (added)
+++ lucene/dev/branches/branch_4x/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction Thu Aug 2 13:00:03 2012
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.lucene.util.hash.MurmurHash2
Added: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java?rev=1368442&view=auto
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java (added)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java Thu Aug 2 13:00:03 2012
@@ -0,0 +1,77 @@
+package org.apache.lucene.codecs.bloom;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
+import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.FuzzySet;
+import org.apache.lucene.util.hash.MurmurHash2;
+
+/**
+ * A class used for testing {@link BloomFilteringPostingsFormat} with a concrete
+ * delegate (Lucene40). Creates a Bloom filter on ALL fields and with tiny
+ * amounts of memory reserved for the filter. DO NOT USE IN A PRODUCTION
+ * APPLICATION This is not a realistic application of Bloom Filters as they
+ * ordinarily are larger and operate on only primary key type fields.
+ */
+public class TestBloomFilteredLucene40Postings extends PostingsFormat {
+
+ private BloomFilteringPostingsFormat delegate;
+
+ // Special class used to avoid OOM exceptions where Junit tests create many
+ // fields.
+ static class LowMemoryBloomFactory extends BloomFilterFactory {
+ @Override
+ public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) {
+ return FuzzySet.createSetBasedOnMaxMemory(1024, new MurmurHash2());
+ }
+
+ @Override
+ public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
+ // For test purposes always maintain the BloomFilter - even past the point
+ // of usefulness when all bits are set
+ return false;
+ }
+ }
+
+ public TestBloomFilteredLucene40Postings() {
+ super("TestBloomFilteredLucene40Postings");
+ delegate = new BloomFilteringPostingsFormat(new Lucene40PostingsFormat(),
+ new LowMemoryBloomFactory());
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state)
+ throws IOException {
+ return delegate.fieldsConsumer(state);
+ }
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state)
+ throws IOException {
+ return delegate.fieldsProducer(state);
+ }
+}
Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java?rev=1368442&r1=1368441&r2=1368442&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java Thu Aug 2 13:00:03 2012
@@ -29,6 +29,7 @@ import java.util.Set;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
+import org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds;
@@ -96,6 +97,10 @@ public class RandomCodec extends Lucene4
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
// add pulsing again with (usually) different parameters
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
+ //TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucene40Postings to be constructed
+ //with a choice of concrete PostingsFormats. Maybe useful to have a generic means of marking and dealing
+ //with such "wrapper" classes?
+ new TestBloomFilteredLucene40Postings(),
new MockSepPostingsFormat(),
new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),
Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1368442&r1=1368441&r2=1368442&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat Thu Aug 2 13:00:03 2012
@@ -20,5 +20,6 @@ org.apache.lucene.codecs.mocksep.MockSep
org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat
org.apache.lucene.codecs.ramonly.RAMOnlyPostingsFormat
org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds
+org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings
org.apache.lucene.codecs.asserting.AssertingPostingsFormat