You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/09/16 00:47:00 UTC
[02/11] Skeleton porting of Lucene.Net.Queries
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/882f487d/src/Lucene.Net.Queries/TermFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Queries/TermFilter.cs b/src/Lucene.Net.Queries/TermFilter.cs
new file mode 100644
index 0000000..1e4ffd0
--- /dev/null
+++ b/src/Lucene.Net.Queries/TermFilter.cs
@@ -0,0 +1,139 @@
+namespace org.apache.lucene.queries
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using AtomicReaderContext = org.apache.lucene.index.AtomicReaderContext;
+ using DocsEnum = org.apache.lucene.index.DocsEnum;
+ using Term = org.apache.lucene.index.Term;
+ using Terms = org.apache.lucene.index.Terms;
+ using TermsEnum = org.apache.lucene.index.TermsEnum;
+ using DocIdSet = org.apache.lucene.search.DocIdSet;
+ using DocIdSetIterator = org.apache.lucene.search.DocIdSetIterator;
+ using Filter = org.apache.lucene.search.Filter;
+ using Bits = org.apache.lucene.util.Bits;
+
+ /// <summary>
+ /// A filter that includes documents that match with a specific term.
+ /// </summary>
+ public sealed class TermFilter : Filter
+ {
+
+ private readonly Term term;
+
+ /// <summary>
+ /// Creates a filter that accepts only the documents containing the given term.
+ /// </summary>
+ /// <param name="term"> The term documents need to have in order to be a match for this filter. </param>
+ /// <exception cref="System.ArgumentException"> If <paramref name="term"/> or its field is null. </exception>
+ public TermFilter(Term term)
+ {
+     // Guard clauses: reject a missing term first, then a term that carries no field.
+     if (term == null)
+     {
+         throw new System.ArgumentException("Term must not be null");
+     }
+
+     if (term.field() == null)
+     {
+         throw new System.ArgumentException("Field must not be null");
+     }
+
+     this.term = term;
+ }
+
+ /// <summary>
+ /// Gets the term that a document must contain to pass this filter.
+ /// </summary>
+ /// <value> The term this filter includes documents with. </value>
+ public Term Term
+ {
+     get { return this.term; }
+ }
+
+ /// <summary>
+ /// Looks this filter's term up in the reader of the given context and, when it is
+ /// present, returns a doc id set that enumerates the matching documents on demand.
+ /// </summary>
+ /// <param name="context"> The reader context whose reader is searched for the term. </param>
+ /// <param name="acceptDocs"> Handed unchanged to <c>TermsEnum.docs</c> when the returned set is iterated. </param>
+ /// <returns>
+ /// A doc id set for the term, or <c>null</c> when the reader has no terms for the
+ /// field or the exact term cannot be found.
+ /// </returns>
+ public override DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
+ {
+     Terms fieldTerms = context.reader().terms(term.field());
+     if (fieldTerms == null)
+     {
+         // The reader exposes no terms for this field.
+         return null;
+     }
+
+     TermsEnum positioned = fieldTerms.iterator(null);
+     bool found = positioned.seekExact(term.bytes());
+     return found ? new DocIdSetAnonymousInnerClassHelper(this, acceptDocs, positioned) : null;
+ }
+
+ /// <summary>
+ /// Named stand-in for the anonymous <c>DocIdSet</c> of the Java original. It keeps the
+ /// terms enum and accept-docs it was given and only asks for the documents when an
+ /// iterator is requested.
+ /// </summary>
+ private class DocIdSetAnonymousInnerClassHelper : DocIdSet
+ {
+     private readonly TermFilter outerInstance;
+     private readonly Bits acceptedDocs;
+     private readonly TermsEnum positionedEnum;
+
+     /// <param name="outerInstance"> The filter that created this set. </param>
+     /// <param name="acceptDocs"> Forwarded to <c>TermsEnum.docs</c> by <see cref="iterator"/>. </param>
+     /// <param name="termsEnum"> The terms enum whose documents are returned. </param>
+     public DocIdSetAnonymousInnerClassHelper(TermFilter outerInstance, Bits acceptDocs, TermsEnum termsEnum)
+     {
+         this.outerInstance = outerInstance;
+         this.acceptedDocs = acceptDocs;
+         this.positionedEnum = termsEnum;
+     }
+
+     /// <returns> The result of <c>docs(acceptDocs, null, DocsEnum.FLAG_NONE)</c> on the stored terms enum. </returns>
+     public override DocIdSetIterator iterator()
+     {
+         return positionedEnum.docs(acceptedDocs, null, DocsEnum.FLAG_NONE);
+     }
+ }
+
+ /// <summary>
+ /// Two term filters are equal when they have the same runtime type and equal terms.
+ /// </summary>
+ /// <param name="o"> The object to compare with. </param>
+ /// <returns> <c>true</c> if <paramref name="o"/> is the same kind of filter for an equal term. </returns>
+ public override bool Equals(object o)
+ {
+     if (this == o)
+     {
+         return true;
+     }
+     if (o == null || this.GetType() != o.GetType())
+     {
+         return false;
+     }
+
+     Term otherTerm = ((TermFilter) o).term;
+     if (term != null)
+     {
+         return term.Equals(otherTerm);
+     }
+
+     // No term on this side: equal only if the other side has none either.
+     return otherTerm == null;
+ }
+
+ /// <returns> The hash code of the wrapped term, or 0 when there is no term. </returns>
+ public override int GetHashCode()
+ {
+     if (term != null)
+     {
+         return term.GetHashCode();
+     }
+     return 0;
+ }
+
+ /// <returns> The term rendered as <c>field:text</c>. </returns>
+ public override string ToString()
+ {
+     return string.Concat(term.field(), ":", term.text());
+ }
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/882f487d/src/Lucene.Net.Queries/TermsFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Queries/TermsFilter.cs b/src/Lucene.Net.Queries/TermsFilter.cs
new file mode 100644
index 0000000..be418fb
--- /dev/null
+++ b/src/Lucene.Net.Queries/TermsFilter.cs
@@ -0,0 +1,439 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace org.apache.lucene.queries
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using org.apache.lucene.index;
+ using DocIdSet = org.apache.lucene.search.DocIdSet;
+ using DocIdSetIterator = org.apache.lucene.search.DocIdSetIterator;
+ using Filter = org.apache.lucene.search.Filter;
+ using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+ using Bits = org.apache.lucene.util.Bits;
+ using BytesRef = org.apache.lucene.util.BytesRef;
+ using FixedBitSet = org.apache.lucene.util.FixedBitSet;
+
+
+ /// <summary>
+ /// Constructs a filter for docs matching any of the terms added to this class.
+ /// Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
+ /// a sequence. An example might be a collection of primary keys from a database query result or perhaps
+ /// a choice of "category" labels picked by the end user. As a filter, this is much faster than the
+ /// equivalent query (a BooleanQuery with many "should" TermQueries)
+ /// </summary>
+ public sealed class TermsFilter : Filter
+ {
+
+ /*
+ * This class is often used with a large number of terms in a single field.
+ * To optimize for this case and to be filter-cache friendly, we
+ * serialize all terms into a single byte array and store offsets
+ * in a parallel array to keep the number of objects constant and speed up
+ * equals / hashcode.
+ *
+ * This adds quite a bit of complexity but allows large term filters to
+ * be efficient for GC and cache lookups.
+ */
+ private readonly int[] offsets;
+ private readonly sbyte[] termsBytes;
+ private readonly TermsAndField[] termsAndFields;
+ private readonly int hashCode_Renamed; // cached hashcode for fast cache lookups
+ private const int PRIME = 31;
+
+ /// <summary>
+ /// Creates a new <seealso cref="TermsFilter"/> from the given list. The list
+ /// can contain duplicate terms and multiple fields.
+ /// </summary>
+ /// <param name="terms"> The terms to match. The list is sorted in place before it is read. </param>
+ /// <exception cref="System.ArgumentException"> If the list is empty or a term has a null field. </exception>
+ public TermsFilter(IList<Term> terms) : this(new FieldAndTermEnumAnonymousInnerClassHelper(terms), terms.Count)
+ {
+ }
+
+ /// <summary>
+ /// Enumerates the sorted terms, returning each term's bytes and recording the field
+ /// of the term that was returned last.
+ /// </summary>
+ private class FieldAndTermEnumAnonymousInnerClassHelper : FieldAndTermEnum
+ {
+     // we need to sort for deduplication and to have a common cache key
+     private readonly IEnumerator<Term> iter;
+
+     // No reference to the enclosing filter is taken: C# forbids 'this' inside a
+     // constructor initializer, and the enumeration never used the outer instance.
+     public FieldAndTermEnumAnonymousInnerClassHelper(IList<Term> terms)
+     {
+         iter = sort(terms).GetEnumerator();
+     }
+
+     /// <returns> The bytes of the next term, or <c>null</c> once all terms have been returned. </returns>
+     public override BytesRef next()
+     {
+         // .NET enumerators advance with MoveNext() and expose the element through Current.
+         if (iter.MoveNext())
+         {
+             Term current = iter.Current;
+             field_Renamed = current.field();
+             return current.bytes();
+         }
+         return null;
+     }
+ }
+
+ /// <summary>
+ /// Creates a new <seealso cref="TermsFilter"/> from the given <seealso cref="BytesRef"/> list for
+ /// a single field.
+ /// </summary>
+ /// <param name="field"> The field all given terms belong to. </param>
+ /// <param name="terms"> The term values to match. The list is sorted in place before it is read. </param>
+ /// <exception cref="System.ArgumentException"> If the list is empty or <paramref name="field"/> is null. </exception>
+ public TermsFilter(string field, IList<BytesRef> terms) : this(new FieldAndTermEnumAnonymousInnerClassHelper2(field, terms), terms.Count)
+ {
+ }
+
+ /// <summary>
+ /// Enumerates the sorted term values of a single, fixed field.
+ /// </summary>
+ private class FieldAndTermEnumAnonymousInnerClassHelper2 : FieldAndTermEnum
+ {
+     // we need to sort for deduplication and to have a common cache key
+     private readonly IEnumerator<BytesRef> iter;
+
+     // No reference to the enclosing filter is taken: C# forbids 'this' inside a
+     // constructor initializer, and the enumeration never used the outer instance.
+     public FieldAndTermEnumAnonymousInnerClassHelper2(string field, IList<BytesRef> terms) : base(field)
+     {
+         iter = sort(terms).GetEnumerator();
+     }
+
+     /// <returns> The next term value, or <c>null</c> once all values have been returned. </returns>
+     public override BytesRef next()
+     {
+         // .NET enumerators advance with MoveNext() and expose the element through Current.
+         if (iter.MoveNext())
+         {
+             return iter.Current;
+         }
+         return null;
+     }
+ }
+
+ /// <summary>
+ /// Creates a new <seealso cref="TermsFilter"/> from the given <seealso cref="BytesRef"/> array for
+ /// a single field.
+ /// </summary>
+ /// <param name="field"> The field all given terms belong to. </param>
+ /// <param name="terms"> The term values to match. The array itself is sorted in place. </param>
+ public TermsFilter(string field, params BytesRef[] terms) : this(field, (IList<BytesRef>) terms)
+ {
+     // this ctor prevents unnecessary Term creations
+     // A .NET array already implements IList<T>, so it is handed on as a list view of
+     // itself; the cast also selects the list overload instead of recursing into this one.
+ }
+
+ /// <summary>
+ /// Creates a new <seealso cref="TermsFilter"/> from the given array. The array can
+ /// contain duplicate terms and multiple fields.
+ /// </summary>
+ /// <param name="terms"> The terms to match. The array itself is sorted in place. </param>
+ public TermsFilter(params Term[] terms) : this((IList<Term>) terms)
+ {
+     // A .NET array already implements IList<T>, so it is handed on as a list view of
+     // itself; the cast also selects the list overload instead of recursing into this one.
+ }
+
+
+ /// <summary>
+ /// Shared constructor. Drains the enumeration, skips a term that repeats the previous
+ /// field and bytes, packs the bytes of all remaining terms into one array, and records
+ /// for every field the index range of its terms. The hash code is computed on the way.
+ /// </summary>
+ /// <param name="iter"> Source of the terms and their fields; duplicates are only removed when adjacent. </param>
+ /// <param name="length"> Number of terms before deduplication; sizes the offsets array. </param>
+ /// <exception cref="System.ArgumentException"> If the enumeration reports a null field for a term. </exception>
+ private TermsFilter(FieldAndTermEnum iter, int length)
+ {
+     // TODO: maybe use oal.index.PrefixCodedTerms instead?
+     // If number of terms is more than a few hundred it
+     // should be a win
+
+     // TODO: we also pack terms in FieldCache/DocValues
+     // ... maybe we can refactor to share that code
+
+     // TODO: yet another option is to build the union of the terms in
+     // an automaton and call intersect on the termsenum if the density is high
+
+     int hash = 9;
+     sbyte[] serializedTerms = new sbyte[0];
+     this.offsets = new int[length + 1];
+     int lastEndOffset = 0;
+     int index = 0;
+     List<TermsAndField> termsAndFields = new List<TermsAndField>();
+     TermsAndField lastTermsAndField = null;
+     BytesRef previousTerm = null;
+     string previousField = null;
+     BytesRef currentTerm;
+     string currentField;
+     while ((currentTerm = iter.next()) != null)
+     {
+         currentField = iter.field();
+         if (currentField == null)
+         {
+             throw new System.ArgumentException("Field must not be null");
+         }
+         if (previousField != null)
+         {
+             // deduplicate
+             if (previousField.Equals(currentField))
+             {
+                 if (previousTerm.bytesEquals(currentTerm))
+                 {
+                     continue;
+                 }
+             }
+             else
+             {
+                 // The field changed: close the index range collected for the previous field.
+                 // (Named differently from the local after the loop; C# rejects a nested
+                 // local that reuses the name of a local of the enclosing block.)
+                 int rangeStart = lastTermsAndField == null ? 0 : lastTermsAndField.end;
+                 lastTermsAndField = new TermsAndField(rangeStart, index, previousField);
+                 termsAndFields.Add(lastTermsAndField);
+             }
+         }
+         // The hash relies on wrapping int arithmetic, as in Java; keep it wrapping
+         // even if the assembly is compiled with overflow checking enabled.
+         unchecked
+         {
+             hash = PRIME * hash + currentField.GetHashCode();
+             hash = PRIME * hash + currentTerm.GetHashCode();
+         }
+         if (serializedTerms.Length < lastEndOffset + currentTerm.length)
+         {
+             serializedTerms = ArrayUtil.grow(serializedTerms, lastEndOffset + currentTerm.length);
+         }
+         Array.Copy(currentTerm.bytes, currentTerm.offset, serializedTerms, lastEndOffset, currentTerm.length);
+         offsets[index] = lastEndOffset;
+         lastEndOffset += currentTerm.length;
+         index++;
+         previousTerm = currentTerm;
+         previousField = currentField;
+     }
+     // Sentinel offset so that the length of term i is offsets[i + 1] - offsets[i].
+     offsets[index] = lastEndOffset;
+     // Close the range of the last field seen.
+     int lastRangeStart = lastTermsAndField == null ? 0 : lastTermsAndField.end;
+     lastTermsAndField = new TermsAndField(lastRangeStart, index, previousField);
+     termsAndFields.Add(lastTermsAndField);
+     this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset);
+     this.termsAndFields = termsAndFields.ToArray();
+     this.hashCode_Renamed = hash;
+ }
+
+
+ /// <summary>
+ /// Collects, for the reader of the given context, every document that contains at
+ /// least one of this filter's terms.
+ /// </summary>
+ /// <param name="context"> The reader context to collect matches from. </param>
+ /// <param name="acceptDocs"> Handed unchanged to <c>TermsEnum.docs</c> for every term that is found. </param>
+ /// <returns>
+ /// A bit set with one bit per matching document, or <c>null</c> when the reader has
+ /// no fields or none of the terms yields a document.
+ /// </returns>
+ public override DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs)
+ {
+     AtomicReader reader = context.reader();
+     FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
+     Fields fields = reader.fields();
+     if (fields == null)
+     {
+         return result;
+     }
+     // Reusable view over the packed term bytes; offset and length are moved per term.
+     BytesRef spare = new BytesRef(this.termsBytes);
+     Terms terms = null;
+     TermsEnum termsEnum = null;
+     DocsEnum docs = null;
+     foreach (TermsAndField termsAndField in this.termsAndFields)
+     {
+         if ((terms = fields.terms(termsAndField.field)) == null)
+         {
+             continue;
+         }
+         termsEnum = terms.iterator(termsEnum); // this won't return null
+         for (int i = termsAndField.start; i < termsAndField.end; i++)
+         {
+             spare.offset = offsets[i];
+             spare.length = offsets[i + 1] - offsets[i];
+             if (!termsEnum.seekExact(spare))
+             {
+                 continue;
+             }
+             docs = termsEnum.docs(acceptDocs, docs, DocsEnum.FLAG_NONE); // no freq since we don't need them
+             int doc = docs.nextDoc();
+             if (doc == DocIdSetIterator.NO_MORE_DOCS)
+             {
+                 // Nothing to record for this term. Do not call nextDoc() again on an
+                 // exhausted enum: its result is unspecified, and result may still be null.
+                 continue;
+             }
+             if (result == null)
+             {
+                 // lazy init but don't do it in the hot loop since we could read many docs
+                 result = new FixedBitSet(reader.maxDoc());
+             }
+             do
+             {
+                 result.set(doc);
+                 doc = docs.nextDoc();
+             } while (doc != DocIdSetIterator.NO_MORE_DOCS);
+         }
+     }
+     return result;
+ }
+
+ /// <summary>
+ /// Two terms filters are equal when they have the same runtime type, the same cached
+ /// hash code, the same per-field ranges, the same offsets and the same packed bytes.
+ /// </summary>
+ /// <param name="obj"> The object to compare with. </param>
+ /// <returns> <c>true</c> if <paramref name="obj"/> is an equal terms filter. </returns>
+ public override bool Equals(object obj)
+ {
+     if (this == obj)
+     {
+         return true;
+     }
+     if ((obj == null) || (obj.GetType() != this.GetType()))
+     {
+         return false;
+     }
+
+     TermsFilter test = (TermsFilter) obj;
+     // first check the fields before even comparing the bytes
+     if (test.hashCode_Renamed == hashCode_Renamed && fieldRangesEqual(termsAndFields, test.termsAndFields))
+     {
+         int lastOffset = termsAndFields[termsAndFields.Length - 1].end;
+         // compare offsets since we sort they must be identical
+         if (ArrayUtil.Equals(offsets, 0, test.offsets, 0, lastOffset + 1))
+         {
+             // straight byte comparison since we sort they must be identical
+             return ArrayUtil.Equals(termsBytes, 0, test.termsBytes, 0, offsets[lastOffset]);
+         }
+     }
+     return false;
+ }
+
+ /// <summary>
+ /// Element-wise comparison of two per-field range arrays. Replaces the Java
+ /// <c>Arrays.equals</c> call, which has no counterpart in scope here.
+ /// </summary>
+ private static bool fieldRangesEqual(TermsAndField[] left, TermsAndField[] right)
+ {
+     if (left.Length != right.Length)
+     {
+         return false;
+     }
+     for (int i = 0; i < left.Length; i++)
+     {
+         if (!left[i].Equals(right[i]))
+         {
+             return false;
+         }
+     }
+     return true;
+ }
+
+ /// <returns> The hash code that was computed once in the constructor. </returns>
+ public override int GetHashCode()
+ {
+     return this.hashCode_Renamed;
+ }
+
+ /// <returns> All terms as <c>field:text</c> pairs separated by single spaces. </returns>
+ public override string ToString()
+ {
+     StringBuilder text = new StringBuilder();
+     // Reusable view over the packed term bytes; repositioned for every term.
+     BytesRef slice = new BytesRef(termsBytes);
+     foreach (TermsAndField range in termsAndFields)
+     {
+         for (int k = range.start; k < range.end; k++)
+         {
+             // Every entry appends at least the ':' so a non-empty builder means
+             // at least one entry has been written already.
+             if (text.Length > 0)
+             {
+                 text.Append(' ');
+             }
+             slice.offset = offsets[k];
+             slice.length = offsets[k + 1] - offsets[k];
+             text.Append(range.field).Append(':').Append(slice.utf8ToString());
+         }
+     }
+
+     return text.ToString();
+ }
+
+ /// <summary>
+ /// Immutable triple of a field name and the index range <c>[start, end)</c> of the
+ /// terms that belong to that field.
+ /// </summary>
+ private sealed class TermsAndField
+ {
+     internal readonly int start;
+     internal readonly int end;
+     internal readonly string field;
+
+     /// <param name="start"> Index of the first term of the field. </param>
+     /// <param name="end"> Index one past the last term of the field. </param>
+     /// <param name="field"> The field name. </param>
+     internal TermsAndField(int start, int end, string field)
+     {
+         this.start = start;
+         this.end = end;
+         this.field = field;
+     }
+
+     /// <returns> A hash combining field, end and start. </returns>
+     public override int GetHashCode()
+     {
+         const int prime = 31;
+         // The combination relies on wrapping int arithmetic, as in Java; keep it
+         // wrapping even if the assembly is compiled with overflow checking enabled.
+         unchecked
+         {
+             int result = 1;
+             result = prime * result + ((field == null) ? 0 : field.GetHashCode());
+             result = prime * result + end;
+             result = prime * result + start;
+             return result;
+         }
+     }
+
+     /// <returns> <c>true</c> if <paramref name="obj"/> is a <c>TermsAndField</c> with the same field, start and end. </returns>
+     public override bool Equals(object obj)
+     {
+         if (this == obj)
+         {
+             return true;
+         }
+         if (obj == null || this.GetType() != obj.GetType())
+         {
+             return false;
+         }
+         TermsAndField other = (TermsAndField) obj;
+         // string.Equals(a, b) is an ordinal comparison that also treats two nulls as equal.
+         return string.Equals(field, other.field) && end == other.end && start == other.start;
+     }
+ }
+
+ /// <summary>
+ /// Forward-only source of terms that also reports the field of the current term.
+ /// </summary>
+ private abstract class FieldAndTermEnum
+ {
+     // Field reported by field(); subclasses may update it while enumerating.
+     protected internal string field_Renamed;
+
+     /// <summary> Creates an enum that starts without a field. </summary>
+     public FieldAndTermEnum() : this(null)
+     {
+     }
+
+     /// <param name="field"> The field reported by <see cref="field()"/> until a subclass changes it. </param>
+     public FieldAndTermEnum(string field)
+     {
+         field_Renamed = field;
+     }
+
+     /// <returns> The next term; the implementations in this file return <c>null</c> when exhausted. </returns>
+     public abstract BytesRef next();
+
+     /// <returns> The field currently stored in this enum. </returns>
+     public virtual string field()
+     {
+         return this.field_Renamed;
+     }
+ }
+
+ /// <summary>
+ /// Simple utility that sorts the given list in place and returns that same list.
+ /// </summary>
+ /// <param name="toSort"> The list to sort; its elements are reordered through its indexer. </param>
+ /// <returns> <paramref name="toSort"/> itself, now in ascending order. </returns>
+ /// <exception cref="System.ArgumentException"> If the list is empty. </exception>
+ private static IList<T> sort<T>(IList<T> toSort) where T : IComparable<T>
+ {
+     if (toSort.Count == 0)
+     {
+         throw new System.ArgumentException("no terms provided");
+     }
+     // IList<T> offers no Sort(), and callers pass both lists and arrays. Sort a
+     // snapshot and write the order back so the caller's collection is sorted in place.
+     // Array.Sort is not stable, which is irrelevant here: elements that compare equal
+     // are dropped as duplicates afterwards.
+     T[] ordered = new T[toSort.Count];
+     toSort.CopyTo(ordered, 0);
+     Array.Sort(ordered);
+     for (int i = 0; i < ordered.Length; i++)
+     {
+         toSort[i] = ordered[i];
+     }
+     return toSort;
+ }
+ }
+
+}
\ No newline at end of file