You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by cc...@apache.org on 2012/03/23 21:16:38 UTC

svn commit: r1304578 [2/2] - /incubator/lucene.net/trunk/src/contrib/Highlighter/

Added: incubator/lucene.net/trunk/src/contrib/Highlighter/WeightedSpanTermExtractor.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/WeightedSpanTermExtractor.cs?rev=1304578&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/WeightedSpanTermExtractor.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/WeightedSpanTermExtractor.cs Fri Mar 23 20:16:37 2012
@@ -0,0 +1,673 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Analysis;
+using Lucene.Net.Index;
+using Lucene.Net.Search.Spans;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Highlight
+{
+    /**
+     * Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether 
+     * {@link Term}s from the {@link Query} are contained in a supplied {@link TokenStream}.
+     */
+
+    public class WeightedSpanTermExtractor
+    {
+
+        private String fieldName;
+        private TokenStream tokenStream;
+        private IDictionary<String, IndexReader> readers = new HashMap<String, IndexReader>(10);
+        private String defaultField;
+        private bool expandMultiTermQuery;
+        private bool cachedTokenStream;
+        private bool wrapToCaching = true;
+
+        public WeightedSpanTermExtractor()
+        {
+        }
+
+        public WeightedSpanTermExtractor(String defaultField)
+        {
+            if (defaultField != null)
+            {
+                this.defaultField = StringHelper.Intern(defaultField);
+            }
+        }
+
+        private void closeReaders()
+        {
+            ICollection<IndexReader> readerSet = readers.Values;
+
+            foreach (IndexReader reader in readerSet)
+            {
+                try
+                {
+                    reader.Close();
+                }
+                catch (IOException e)
+                {
+                    // alert?
+                }
+            }
+        }
+
+        /**
+         * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>Query</code>.
+         * 
+         * @param query
+         *          Query to extract Terms from
+         * @param terms
+         *          Map to place created WeightedSpanTerms in
+         * @throws IOException
+         */
+
+        private void extract(Query query, IDictionary<String, WeightedSpanTerm> terms)
+        {
+            if (query is BooleanQuery)
+            {
+                BooleanClause[] queryClauses = ((BooleanQuery) query).GetClauses();
+
+                for (int i = 0; i < queryClauses.Length; i++)
+                {
+                    if (!queryClauses[i].Prohibited)
+                    {
+                        extract(queryClauses[i].Query, terms);
+                    }
+                }
+            }
+            else if (query is PhraseQuery)
+            {
+                PhraseQuery phraseQuery = ((PhraseQuery) query);
+                Term[] phraseQueryTerms = phraseQuery.GetTerms();
+                SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length];
+                for (int i = 0; i < phraseQueryTerms.Length; i++)
+                {
+                    clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
+                }
+                int slop = phraseQuery.Slop;
+                int[] positions = phraseQuery.GetPositions();
+                // add largest position increment to slop
+                if (positions.Length > 0)
+                {
+                    int lastPos = positions[0];
+                    int largestInc = 0;
+                    int sz = positions.Length;
+                    for (int i = 1; i < sz; i++)
+                    {
+                        int pos = positions[i];
+                        int inc = pos - lastPos;
+                        if (inc > largestInc)
+                        {
+                            largestInc = inc;
+                        }
+                        lastPos = pos;
+                    }
+                    if (largestInc > 1)
+                    {
+                        slop += largestInc;
+                    }
+                }
+
+                bool inorder = slop == 0;
+
+                SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder);
+                sp.Boost = query.Boost;
+                extractWeightedSpanTerms(terms, sp);
+            }
+            else if (query is TermQuery)
+            {
+                extractWeightedTerms(terms, query);
+            }
+            else if (query is SpanQuery)
+            {
+                extractWeightedSpanTerms(terms, (SpanQuery) query);
+            }
+            else if (query is FilteredQuery)
+            {
+                extract(((FilteredQuery) query).Query, terms);
+            }
+            else if (query is DisjunctionMaxQuery)
+            {
+                foreach (var q in ((DisjunctionMaxQuery) query))
+                {
+                    extract(q, terms);
+                }
+            }
+            else if (query is MultiTermQuery && expandMultiTermQuery)
+            {
+                MultiTermQuery mtq = ((MultiTermQuery) query);
+                if (mtq.QueryRewriteMethod != MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
+                {
+                    mtq = (MultiTermQuery) mtq.Clone();
+                    mtq.QueryRewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
+                    query = mtq;
+                }
+                FakeReader fReader = new FakeReader();
+                MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(fReader, mtq);
+                if (fReader.Field != null)
+                {
+                    IndexReader ir = getReaderForField(fReader.Field);
+                    extract(query.Rewrite(ir), terms);
+                }
+            }
+            else if (query is MultiPhraseQuery)
+            {
+                MultiPhraseQuery mpq = (MultiPhraseQuery) query;
+                IList<Term[]> termArrays = mpq.GetTermArrays();
+                int[] positions = mpq.GetPositions();
+                if (positions.Length > 0)
+                {
+
+                    int maxPosition = positions[positions.Length - 1];
+                    for (int i = 0; i < positions.Length - 1; ++i)
+                    {
+                        if (positions[i] > maxPosition)
+                        {
+                            maxPosition = positions[i];
+                        }
+                    }
+
+                    var disjunctLists = new IList<SpanQuery>[maxPosition + 1];
+                    int distinctPositions = 0;
+
+                    for (int i = 0; i < termArrays.Count; ++i)
+                    {
+                        Term[] termArray = termArrays[i];
+                        IList<SpanQuery> disjuncts = disjunctLists[positions[i]];
+                        if (disjuncts == null)
+                        {
+                            disjuncts = (disjunctLists[positions[i]] = new List<SpanQuery>(termArray.Length));
+                            ++distinctPositions;
+                        }
+                        for (int j = 0; j < termArray.Length; ++j)
+                        {
+                            disjuncts.Add(new SpanTermQuery(termArray[j]));
+                        }
+                    }
+
+                    int positionGaps = 0;
+                    int position = 0;
+                    var clauses = new SpanQuery[distinctPositions];
+                    for (int i = 0; i < disjunctLists.Length; ++i)
+                    {
+                        IList<SpanQuery> disjuncts = disjunctLists[i];
+                        if (disjuncts != null)
+                        {
+                            clauses[position++] = new SpanOrQuery(disjuncts.ToArray());
+                        }
+                        else
+                        {
+                            ++positionGaps;
+                        }
+                    }
+
+                    int slop = mpq.Slop;
+                    bool inorder = (slop == 0);
+
+                    SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
+                    sp.Boost = query.Boost;
+                    extractWeightedSpanTerms(terms, sp);
+                }
+            }
+        }
+
+        /**
+         * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>SpanQuery</code>.
+         * 
+         * @param terms
+         *          Map to place created WeightedSpanTerms in
+         * @param spanQuery
+         *          SpanQuery to extract Terms from
+         * @throws IOException
+         */
+
+        private void extractWeightedSpanTerms(IDictionary<String, WeightedSpanTerm> terms, SpanQuery spanQuery)
+        {
+            ISet<String> fieldNames;
+
+            if (fieldName == null)
+            {
+                fieldNames = new HashSet<String>();
+                collectSpanQueryFields(spanQuery, fieldNames);
+            }
+            else
+            {
+                fieldNames = new HashSet<String> {fieldName};
+            }
+            // To support the use of the default field name
+            if (defaultField != null)
+            {
+                fieldNames.Add(defaultField);
+            }
+
+            HashMap<String, SpanQuery> queries = new HashMap<String, SpanQuery>();
+
+            ISet<Term> nonWeightedTerms = new HashSet<Term>();
+            bool mrq = mustRewriteQuery(spanQuery);
+            if (mrq)
+            {
+                foreach (var field in fieldNames)
+                {
+                    SpanQuery rewrittenQuery = (SpanQuery) spanQuery.Rewrite(getReaderForField(field));
+                    queries[field] = rewrittenQuery;
+                    rewrittenQuery.ExtractTerms(nonWeightedTerms);
+                }
+            }
+            else
+            {
+                spanQuery.ExtractTerms(nonWeightedTerms);
+            }
+
+            List<PositionSpan> spanPositions = new List<PositionSpan>();
+
+            foreach (String field in fieldNames)
+            {
+
+                IndexReader reader = getReaderForField(field);
+                Spans.Spans spans = mrq ? queries[field].GetSpans(reader) : spanQuery.GetSpans(reader);
+
+
+                // collect span positions
+                while (spans.Next())
+                {
+                    spanPositions.Add(new PositionSpan(spans.Start(), spans.End() - 1));
+                }
+
+            }
+
+            if (spanPositions.Count == 0)
+            {
+                // no spans found
+                return;
+            }
+
+            foreach (Term queryTerm in nonWeightedTerms)
+            {
+
+                if (fieldNameComparator(queryTerm.Field))
+                {
+                    WeightedSpanTerm weightedSpanTerm = terms[queryTerm.Text];
+
+                    if (weightedSpanTerm == null)
+                    {
+                        weightedSpanTerm = new WeightedSpanTerm(spanQuery.Boost, queryTerm.Text);
+                        weightedSpanTerm.addPositionSpans(spanPositions);
+                        weightedSpanTerm.setPositionSensitive(true);
+                        terms[queryTerm.Text] = weightedSpanTerm;
+                    }
+                    else
+                    {
+                        if (spanPositions.Count > 0)
+                        {
+                            weightedSpanTerm.addPositionSpans(spanPositions);
+                        }
+                    }
+                }
+            }
+        }
+
+        /**
+         * Fills a <code>Map</code> with <@link WeightedSpanTerm>s using the terms from the supplied <code>Query</code>.
+         * 
+         * @param terms
+         *          Map to place created WeightedSpanTerms in
+         * @param query
+         *          Query to extract Terms from
+         * @throws IOException
+         */
+
+        private void extractWeightedTerms(IDictionary<String, WeightedSpanTerm> terms, Query query)
+        {
+            ISet<Term> nonWeightedTerms = new HashSet<Term>();
+            query.ExtractTerms(nonWeightedTerms);
+
+            foreach (Term queryTerm in nonWeightedTerms)
+            {
+
+                if (fieldNameComparator(queryTerm.Field))
+                {
+                    WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.Boost, queryTerm.Text);
+                    terms[queryTerm.Text] = weightedSpanTerm;
+                }
+            }
+        }
+
+        /**
+         * Necessary to implement matches for queries against <code>defaultField</code>
+         */
+
+        private bool fieldNameComparator(String fieldNameToCheck)
+        {
+            bool rv = fieldName == null || fieldNameToCheck == fieldName
+                      || fieldNameToCheck == defaultField;
+            return rv;
+        }
+
+        private IndexReader getReaderForField(String field)
+        {
+            if (wrapToCaching && !cachedTokenStream && !(tokenStream is CachingTokenFilter))
+            {
+                tokenStream = new CachingTokenFilter(tokenStream);
+                cachedTokenStream = true;
+            }
+            IndexReader reader = readers[field];
+            if (reader == null)
+            {
+                //MemoryIndex indexer = new MemoryIndex();
+                //indexer.AddField(field, tokenStream);
+                //tokenStream.Reset();
+                //IndexSearcher searcher = indexer.CreateSearcher();
+                //reader = searcher.IndexReader;
+                //readers[field] = reader;
+            }
+
+            return reader;
+        }
+
+        /**
+         * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
+         * 
+         * <p>
+         * 
+         * @param query
+         *          that caused hit
+         * @param tokenStream
+         *          of text to be highlighted
+         * @return Map containing WeightedSpanTerms
+         * @throws IOException
+         */
+
+        public HashMap<String, WeightedSpanTerm> getWeightedSpanTerms(Query query, TokenStream tokenStream)
+        {
+            return getWeightedSpanTerms(query, tokenStream, null);
+        }
+
+        /**
+         * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
+         * 
+         * <p>
+         * 
+         * @param query
+         *          that caused hit
+         * @param tokenStream
+         *          of text to be highlighted
+         * @param fieldName
+         *          restricts Term's used based on field name
+         * @return Map containing WeightedSpanTerms
+         * @throws IOException
+         */
+
+        public HashMap<String, WeightedSpanTerm> getWeightedSpanTerms(Query query, TokenStream tokenStream,
+                                                                      String fieldName)
+        {
+            if (fieldName != null)
+            {
+                this.fieldName = StringHelper.Intern(fieldName);
+            }
+            else
+            {
+                this.fieldName = null;
+            }
+
+            HashMap<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
+            this.tokenStream = tokenStream;
+            try
+            {
+                extract(query, terms);
+            }
+            finally
+            {
+                closeReaders();
+            }
+
+            return terms;
+        }
+
+        /**
+         * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
+         * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
+         * 
+         * <p>
+         * 
+         * @param query
+         *          that caused hit
+         * @param tokenStream
+         *          of text to be highlighted
+         * @param fieldName
+         *          restricts Term's used based on field name
+         * @param reader
+         *          to use for scoring
+         * @return Map of WeightedSpanTerms with quasi tf/idf scores
+         * @throws IOException
+         */
+
+        public HashMap<String, WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
+                                                                                String fieldName,
+                                                                                IndexReader reader)
+        {
+            if (fieldName != null)
+            {
+                this.fieldName = StringHelper.Intern(fieldName);
+            }
+            else
+            {
+                this.fieldName = null;
+            }
+            this.tokenStream = tokenStream;
+
+            HashMap<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
+            extract(query, terms);
+
+            int totalNumDocs = reader.NumDocs();
+            var weightedTerms = terms.Keys;
+
+            try
+            {
+                foreach (var term in weightedTerms)
+                {
+                    WeightedSpanTerm weightedSpanTerm = terms[term];
+                    int docFreq = reader.DocFreq(new Term(fieldName, weightedSpanTerm.term));
+                    // docFreq counts deletes
+                    if (totalNumDocs < docFreq)
+                    {
+                        docFreq = totalNumDocs;
+                    }
+                    // IDF algorithm taken from DefaultSimilarity class
+                    float idf = (float) (Math.Log((float) totalNumDocs/(double) (docFreq + 1)) + 1.0);
+                    weightedSpanTerm.weight *= idf;
+                }
+
+            }
+            finally
+            {
+
+                closeReaders();
+            }
+
+            return terms;
+        }
+
+        private void collectSpanQueryFields(SpanQuery spanQuery, ISet<String> fieldNames)
+        {
+            if (spanQuery is FieldMaskingSpanQuery)
+            {
+                collectSpanQueryFields(((FieldMaskingSpanQuery) spanQuery).MaskedQuery, fieldNames);
+            }
+            else if (spanQuery is SpanFirstQuery)
+            {
+                collectSpanQueryFields(((SpanFirstQuery) spanQuery).Match, fieldNames);
+            }
+            else if (spanQuery is SpanNearQuery)
+            {
+                foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses())
+                {
+                    collectSpanQueryFields(clause, fieldNames);
+                }
+            }
+            else if (spanQuery is SpanNotQuery)
+            {
+                collectSpanQueryFields(((SpanNotQuery) spanQuery).Include, fieldNames);
+            }
+            else if (spanQuery is SpanOrQuery)
+            {
+                foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses())
+                {
+                    collectSpanQueryFields(clause, fieldNames);
+                }
+            }
+            else
+            {
+                fieldNames.Add(spanQuery.Field);
+            }
+        }
+
+        private bool mustRewriteQuery(SpanQuery spanQuery)
+        {
+            if (!expandMultiTermQuery)
+            {
+                return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery.
+            }
+            else if (spanQuery is FieldMaskingSpanQuery)
+            {
+                return mustRewriteQuery(((FieldMaskingSpanQuery) spanQuery).MaskedQuery);
+            }
+            else if (spanQuery is SpanFirstQuery)
+            {
+                return mustRewriteQuery(((SpanFirstQuery) spanQuery).Match);
+            }
+            else if (spanQuery is SpanNearQuery)
+            {
+                foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses())
+                {
+                    if (mustRewriteQuery(clause))
+                    {
+                        return true;
+                    }
+                }
+                return false;
+            }
+            else if (spanQuery is SpanNotQuery)
+            {
+                SpanNotQuery spanNotQuery = (SpanNotQuery) spanQuery;
+                return mustRewriteQuery(spanNotQuery.Include) || mustRewriteQuery(spanNotQuery.Exclude);
+            }
+            else if (spanQuery is SpanOrQuery)
+            {
+                foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses())
+                {
+                    if (mustRewriteQuery(clause))
+                    {
+                        return true;
+                    }
+                }
+                return false;
+            }
+            else if (spanQuery is SpanTermQuery)
+            {
+                return false;
+            }
+            else
+            {
+                return true;
+            }
+        }
+
+        /**
+         * This class makes sure that if both position sensitive and insensitive
+         * versions of the same term are added, the position insensitive one wins.
+         */
+
+        private class PositionCheckingMap<K> : HashMap<K, WeightedSpanTerm>
+        {
+
+            public void PutAll(IDictionary<K, WeightedSpanTerm> m)
+            {
+                foreach (var entry in m)
+                {
+                    this[entry.Key] = entry.Value;
+                }
+            }
+
+            public override void Add(K key, WeightedSpanTerm value)
+            {
+                WeightedSpanTerm prev = base[key] = value;
+
+                WeightedSpanTerm prevTerm = prev;
+                WeightedSpanTerm newTerm = value;
+                if (!prevTerm.isPositionSensitive())
+                {
+                    newTerm.setPositionSensitive(false);
+                }
+            }
+
+        }
+
+        public bool getExpandMultiTermQuery()
+        {
+            return expandMultiTermQuery;
+        }
+
+        public void setExpandMultiTermQuery(bool expandMultiTermQuery)
+        {
+            this.expandMultiTermQuery = expandMultiTermQuery;
+        }
+
+        public bool isCachedTokenStream()
+        {
+            return cachedTokenStream;
+        }
+
+        public TokenStream getTokenStream()
+        {
+            return tokenStream;
+        }
+
+        /**
+         * By default, {@link TokenStream}s that are not of the type
+         * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
+         * ensure an efficient reset - if you are already using a different caching
+         * {@link TokenStream} impl and you don't want it to be wrapped, set this to
+         * false.
+         * 
+         * @param wrap
+         */
+
+        public void setWrapIfNotCachingTokenFilter(bool wrap)
+        {
+            this.wrapToCaching = wrap;
+        }
+
+        /**
+         * 
+         * A fake IndexReader class to extract the field from a MultiTermQuery
+         * 
+         */
+        private class FakeReader : FilterIndexReader
+        {
+            //See if this will work.
+            private static IndexReader EMPTY_MEMORY_INDEX_READER = IndexReader.Open(new RAMDirectory());
+            //private static IndexReader EMPTY_MEMORY_INDEX_READER = new MemoryIndex().createSearcher().getIndexReader();
+
+            public FakeReader()
+                : base(EMPTY_MEMORY_INDEX_READER)
+            {
+            }
+
+            public string Field { get; set; }
+
+            public override TermEnum Terms(Term t)
+            {
+                // only set first fieldname, maybe use a Set?
+                if (t != null && Field == null)
+                    Field = t.Field;
+                return base.Terms(t);
+            }
+        }
+    }
+}
\ No newline at end of file

Modified: incubator/lucene.net/trunk/src/contrib/Highlighter/WeightedTerm.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Highlighter/WeightedTerm.cs?rev=1304578&r1=1304577&r2=1304578&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Highlighter/WeightedTerm.cs (original)
+++ incubator/lucene.net/trunk/src/contrib/Highlighter/WeightedTerm.cs Fri Mar 23 20:16:37 2012
@@ -26,6 +26,7 @@ namespace Lucene.Net.Highlight
 	{
 		internal float weight; // multiplier
 		internal System.String term; //stemmed form
+
 		public WeightedTerm(float weight, System.String term)
 		{
 			this.weight = weight;