You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2014/01/26 05:49:19 UTC
svn commit: r1561451 - in /lucene/dev/trunk: lucene/
lucene/core/src/java/org/apache/lucene/search/
lucene/core/src/java/org/apache/lucene/search/spans/
lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/
lucene/highlighter/src/test...
Author: rmuir
Date: Sun Jan 26 04:49:18 2014
New Revision: 1561451
URL: http://svn.apache.org/r1561451
Log:
LUCENE-5415: add multitermquery support to PostingsHighlighter
Added:
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java (with props)
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
lucene/dev/trunk/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1561451&r1=1561450&r2=1561451&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sun Jan 26 04:49:18 2014
@@ -116,6 +116,9 @@ New Features
fixes too. More info:
https://github.com/spatial4j/spatial4j/blob/master/CHANGES.md (David Smiley)
+* LUCENE-5415: Add multitermquery support (wildcards, prefix, etc.) to PostingsHighlighter.
+ (Mike McCandless, Robert Muir)
+
Build
* LUCENE-5217: Maven config: get dependencies from Ant+Ivy config; disable
@@ -201,6 +204,9 @@ Bug fixes
the same Directory to multiple concurrent addIndexes calls (which is
anyways unusual). (Robert Muir, Mike McCandless)
+* LUCENE-5415: SpanMultiTermQueryWrapper didn't handle its boost in
+ hashCode/equals/toString/rewrite. (Robert Muir)
+
API Changes
* LUCENE-5339: The facet module was simplified/reworked to make the
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java?rev=1561451&r1=1561450&r2=1561451&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java Sun Jan 26 04:49:18 2014
@@ -128,4 +128,9 @@ public class AutomatonQuery extends Mult
buffer.append(ToStringUtils.boost(getBoost()));
return buffer.toString();
}
+
+ /** Returns the automaton used to create this query */
+ public Automaton getAutomaton() {
+ return automaton;
+ }
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java?rev=1561451&r1=1561450&r2=1561451&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java Sun Jan 26 04:49:18 2014
@@ -138,6 +138,14 @@ public class FuzzyQuery extends MultiTer
public int getPrefixLength() {
return prefixLength;
}
+
+ /**
+ * Returns true if transpositions should be treated as a primitive edit operation.
+ * If this is false, comparisons will implement the classic Levenshtein algorithm.
+ */
+ public boolean getTranspositions() {
+ return transpositions;
+ }
@Override
protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java?rev=1561451&r1=1561450&r2=1561451&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java Sun Jan 26 04:49:18 2014
@@ -100,6 +100,11 @@ public class SpanMultiTermQueryWrapper<Q
public String getField() {
return query.getField();
}
+
+ /** Returns the wrapped query */
+ public Query getWrappedQuery() {
+ return query;
+ }
@Override
public String toString(String field) {
@@ -107,6 +112,10 @@ public class SpanMultiTermQueryWrapper<Q
builder.append("SpanMultiTermQueryWrapper(");
builder.append(query.toString(field));
builder.append(")");
+ if (getBoost() != 1F) {
+ builder.append('^');
+ builder.append(getBoost());
+ }
return builder.toString();
}
@@ -115,22 +124,26 @@ public class SpanMultiTermQueryWrapper<Q
final Query q = query.rewrite(reader);
if (!(q instanceof SpanQuery))
throw new UnsupportedOperationException("You can only use SpanMultiTermQueryWrapper with a suitable SpanRewriteMethod.");
+ q.setBoost(q.getBoost() * getBoost()); // multiply boost
return q;
}
@Override
public int hashCode() {
- return 31 * query.hashCode();
+ final int prime = 31;
+ int result = super.hashCode();
+ result = prime * result + query.hashCode();
+ return result;
}
@Override
- @SuppressWarnings({"rawtypes","unchecked"})
public boolean equals(Object obj) {
if (this == obj) return true;
- if (obj == null) return false;
+ if (!super.equals(obj)) return false;
if (getClass() != obj.getClass()) return false;
- final SpanMultiTermQueryWrapper other = (SpanMultiTermQueryWrapper) obj;
- return query.equals(other.query);
+ SpanMultiTermQueryWrapper<?> other = (SpanMultiTermQueryWrapper<?>) obj;
+ if (!query.equals(other.query)) return false;
+ return true;
}
/** Abstract class that defines how the query is rewritten. */
Added: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java?rev=1561451&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java Sun Jan 26 04:49:18 2014
@@ -0,0 +1,284 @@
+package org.apache.lucene.search.postingshighlight;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.AutomatonQuery;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanPositionCheckQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicAutomata;
+import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+
+/**
+ * Support for highlighting multiterm queries in PostingsHighlighter.
+ */
+class MultiTermHighlighting {
+
+ /**
+ * Extracts all MultiTermQueries for {@code field}, and returns equivalent
+ * automata that will match terms.
+ */
+ static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
+ List<CharacterRunAutomaton> list = new ArrayList<>();
+ if (query instanceof BooleanQuery) {
+ BooleanClause clauses[] = ((BooleanQuery) query).getClauses();
+ for (BooleanClause clause : clauses) {
+ if (!clause.isProhibited()) {
+ list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
+ }
+ }
+ } else if (query instanceof DisjunctionMaxQuery) {
+ for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
+ list.addAll(Arrays.asList(extractAutomata(sub, field)));
+ }
+ } else if (query instanceof SpanOrQuery) {
+ for (Query sub : ((SpanOrQuery) query).getClauses()) {
+ list.addAll(Arrays.asList(extractAutomata(sub, field)));
+ }
+ } else if (query instanceof SpanNearQuery) {
+ for (Query sub : ((SpanNearQuery) query).getClauses()) {
+ list.addAll(Arrays.asList(extractAutomata(sub, field)));
+ }
+ } else if (query instanceof SpanNotQuery) {
+ list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
+ } else if (query instanceof SpanPositionCheckQuery) {
+ list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
+ } else if (query instanceof SpanMultiTermQueryWrapper) {
+ list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
+ } else if (query instanceof AutomatonQuery) {
+ final AutomatonQuery aq = (AutomatonQuery) query;
+ if (aq.getField().equals(field)) {
+ list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
+ @Override
+ public String toString() {
+ return aq.toString();
+ }
+ });
+ }
+ } else if (query instanceof PrefixQuery) {
+ final PrefixQuery pq = (PrefixQuery) query;
+ Term prefix = pq.getPrefix();
+ if (prefix.field().equals(field)) {
+ list.add(new CharacterRunAutomaton(BasicOperations.concatenate(BasicAutomata.makeString(prefix.text()),
+ BasicAutomata.makeAnyString())) {
+ @Override
+ public String toString() {
+ return pq.toString();
+ }
+ });
+ }
+ } else if (query instanceof FuzzyQuery) {
+ final FuzzyQuery fq = (FuzzyQuery) query;
+ if (fq.getField().equals(field)) {
+ String utf16 = fq.getTerm().text();
+ int termText[] = new int[utf16.codePointCount(0, utf16.length())];
+ for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
+ termText[j++] = cp = utf16.codePointAt(i);
+ }
+ int termLength = termText.length;
+ int prefixLength = Math.min(fq.getPrefixLength(), termLength);
+ String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
+ LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
+ Automaton automaton = builder.toAutomaton(fq.getMaxEdits());
+ if (prefixLength > 0) {
+ Automaton prefix = BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength));
+ automaton = BasicOperations.concatenate(prefix, automaton);
+ }
+ list.add(new CharacterRunAutomaton(automaton) {
+ @Override
+ public String toString() {
+ return fq.toString();
+ }
+ });
+ }
+ } else if (query instanceof TermRangeQuery) {
+ final TermRangeQuery tq = (TermRangeQuery) query;
+ if (tq.getField().equals(field)) {
+ final CharsRef lowerBound;
+ if (tq.getLowerTerm() == null) {
+ lowerBound = null;
+ } else {
+ lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
+ }
+
+ final CharsRef upperBound;
+ if (tq.getUpperTerm() == null) {
+ upperBound = null;
+ } else {
+ upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
+ }
+
+ final boolean includeLower = tq.includesLower();
+ final boolean includeUpper = tq.includesUpper();
+ final CharsRef scratch = new CharsRef();
+ final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
+
+ // this is *not* an automaton, but it's very simple
+ list.add(new CharacterRunAutomaton(BasicAutomata.makeEmpty()) {
+ @Override
+ public boolean run(char[] s, int offset, int length) {
+ scratch.chars = s;
+ scratch.offset = offset;
+ scratch.length = length;
+
+ if (lowerBound != null) {
+ int cmp = comparator.compare(scratch, lowerBound);
+ if (cmp < 0 || (!includeLower && cmp == 0)) {
+ return false;
+ }
+ }
+
+ if (upperBound != null) {
+ int cmp = comparator.compare(scratch, upperBound);
+ if (cmp > 0 || (!includeUpper && cmp == 0)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public String toString() {
+ return tq.toString();
+ }
+ });
+ }
+ }
+ return list.toArray(new CharacterRunAutomaton[list.size()]);
+ }
+
+ /**
+ * Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
+ * matches tokens.
+ * <p>
+ * This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
+ */
+ static DocsAndPositionsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
+ final CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);
+ final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+ ts.reset();
+
+ // TODO: we could use CachingTokenFilter (or consume twice) to allow us to have a true freq()
+ // but this would have a performance cost for likely little gain in the user experience, it
+ // would only serve to make this method less bogus.
+ // instead, we always return freq() = Integer.MAX_VALUE and let PH terminate based on offset...
+
+ return new DocsAndPositionsEnum() {
+ int currentDoc = -1;
+ int currentMatch = -1;
+ int currentStartOffset = -1;
+ int currentEndOffset = -1;
+ TokenStream stream = ts;
+
+ final BytesRef matchDescriptions[] = new BytesRef[matchers.length];
+
+ @Override
+ public int nextPosition() throws IOException {
+ if (stream != null) {
+ while (stream.incrementToken()) {
+ for (int i = 0; i < matchers.length; i++) {
+ if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
+ currentStartOffset = offsetAtt.startOffset();
+ currentEndOffset = offsetAtt.endOffset();
+ currentMatch = i;
+ return 0;
+ }
+ }
+ }
+ stream.end();
+ stream.close();
+ stream = null;
+ }
+ // exhausted
+ currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
+ return Integer.MAX_VALUE;
+ }
+
+ @Override
+ public int freq() throws IOException {
+ return Integer.MAX_VALUE; // lie
+ }
+
+ @Override
+ public int startOffset() throws IOException {
+ assert currentStartOffset >= 0;
+ return currentStartOffset;
+ }
+
+ @Override
+ public int endOffset() throws IOException {
+ assert currentEndOffset >= 0;
+ return currentEndOffset;
+ }
+
+ @Override
+ public BytesRef getPayload() throws IOException {
+ if (matchDescriptions[currentMatch] == null) {
+ matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
+ }
+ return matchDescriptions[currentMatch];
+ }
+
+ @Override
+ public int docID() {
+ return currentDoc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ return currentDoc = target;
+ }
+
+ @Override
+ public long cost() {
+ return 0;
+ }
+ };
+ }
+}
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1561451&r1=1561450&r2=1561451&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Sun Jan 26 04:49:18 2014
@@ -30,6 +30,7 @@ import java.util.PriorityQueue;
import java.util.SortedSet;
import java.util.TreeSet;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
@@ -50,6 +51,7 @@ import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Simple highlighter that does not analyze fields nor use
@@ -64,6 +66,14 @@ import org.apache.lucene.util.UnicodeUti
* into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
* Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
* <p>
+ * You can customize the behavior by subclassing this highlighter. Some important hooks:
+ * <ul>
+ * <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
+ * <li>{@link #getScorer(String)}: Customize how passages are ranked.
+ * <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
+ * <li>{@link #getIndexAnalyzer(String)}: Enable highlighting of MultiTermQueries such as {@code WildcardQuery}.
+ * </ul>
+ * <p>
* <b>WARNING</b>: The code is very new and probably still has some exciting bugs!
* <p>
* Example usage:
@@ -335,9 +345,9 @@ public class PostingsHighlighter {
throw new IllegalArgumentException("invalid number of maxPassagesIn");
}
final IndexReader reader = searcher.getIndexReader();
- query = rewrite(query);
+ Query rewritten = rewrite(query);
SortedSet<Term> queryTerms = new TreeSet<Term>();
- query.extractTerms(queryTerms);
+ rewritten.extractTerms(queryTerms);
IndexReaderContext readerContext = reader.getContext();
List<AtomicReaderContext> leaves = readerContext.leaves();
@@ -389,7 +399,7 @@ public class PostingsHighlighter {
for(Term term : fieldTerms) {
terms[termUpto++] = term.bytes();
}
- Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
+ Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query);
Object[] result = new Object[docids.length];
for (int j = 0; j < docidsIn.length; j++) {
@@ -432,8 +442,18 @@ public class PostingsHighlighter {
protected char getMultiValuedSeparator(String field) {
return ' ';
}
+
+ /**
+ * Returns the analyzer originally used to index the content for {@code field}.
+ * <p>
+ * This is used to highlight some MultiTermQueries.
+ * @return Analyzer or null (the default, meaning no special multi-term processing)
+ */
+ protected Analyzer getIndexAnalyzer(String field) {
+ return null;
+ }
- private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
+ private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages, Query query) throws IOException {
Map<Integer,Object> highlights = new HashMap<Integer,Object>();
// reuse in the real sense... for docs in same segment we just advance our old enum
@@ -445,6 +465,21 @@ public class PostingsHighlighter {
if (fieldFormatter == null) {
throw new NullPointerException("PassageFormatter cannot be null");
}
+
+ // check if we should do any multi-term processing
+ Analyzer analyzer = getIndexAnalyzer(field);
+ CharacterRunAutomaton automata[] = new CharacterRunAutomaton[0];
+ if (analyzer != null) {
+ automata = MultiTermHighlighting.extractAutomata(query, field);
+ }
+
+ final BytesRef allTerms[];
+ if (automata.length > 0) {
+ allTerms = new BytesRef[terms.length + 1];
+ System.arraycopy(terms, 0, allTerms, 0, terms.length);
+ } else {
+ allTerms = terms;
+ }
for (int i = 0; i < docids.length; i++) {
String content = contents[i];
@@ -462,9 +497,14 @@ public class PostingsHighlighter {
}
if (leaf != lastLeaf) {
termsEnum = t.iterator(null);
- postings = new DocsAndPositionsEnum[terms.length];
+ postings = new DocsAndPositionsEnum[allTerms.length];
+ }
+ if (automata.length > 0) {
+ DocsAndPositionsEnum dp = MultiTermHighlighting.getDocsEnum(analyzer.tokenStream(field, content), automata);
+ dp.advance(doc - subContext.docBase);
+ postings[terms.length] = dp;
}
- Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+ Passage passages[] = highlightDoc(field, allTerms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
if (passages.length == 0) {
passages = getEmptyHighlight(field, bi, maxPassages);
}
@@ -593,7 +633,13 @@ public class PostingsHighlighter {
int tf = 0;
while (true) {
tf++;
- current.addMatch(start, end, terms[off.id]);
+ BytesRef term = terms[off.id];
+ if (term == null) {
+ // multitermquery match, pull from payload
+ term = off.dp.getPayload();
+ assert term != null;
+ }
+ current.addMatch(start, end, term);
if (off.pos == dp.freq()) {
break; // removed from pq
} else {
Added: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java?rev=1561451&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestMultiTermHighlighting.java Sun Jan 26 04:49:18 2014
@@ -0,0 +1,797 @@
+package org.apache.lucene.search.postingshighlight;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RegexpQuery;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.spans.SpanFirstQuery;
+import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+
+/**
+ * Some tests that override {@link PostingsHighlighter#getIndexAnalyzer} to
+ * highlight wildcard, fuzzy, etc. queries.
+ */
+@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"})
+public class TestMultiTermHighlighting extends LuceneTestCase {
+
+ public void testWildcards() throws Exception {
+ Directory dir = newDirectory();
+ // use simpleanalyzer for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ Query query = new WildcardQuery(new Term("body", "te*"));
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // wrong field
+ BooleanQuery bq = new BooleanQuery();
+ bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ bq.add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.SHOULD);
+ topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", bq, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a test.", snippets[0]);
+ assertEquals("Test a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ public void testOnePrefix() throws Exception {
+ Directory dir = newDirectory();
+ // use simpleanalyzer for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ Query query = new PrefixQuery(new Term("body", "te"));
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // wrong field
+ BooleanQuery bq = new BooleanQuery();
+ bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ bq.add(new PrefixQuery(new Term("bogus", "te")), BooleanClause.Occur.SHOULD);
+ topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", bq, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a test.", snippets[0]);
+ assertEquals("Test a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a RegexpQuery ("te.*") on "body", then checks that the same
+ * regexp on a different field ("bogus") leaves the "body" snippets unmarked. */
+ public void testOneRegexp() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ Query query = new RegexpQuery(new Term("body", "te.*"));
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // wrong field: the regexp clause targets "bogus"; MatchAllDocsQuery keeps both docs as hits
+ BooleanQuery bq = new BooleanQuery();
+ bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ bq.add(new RegexpQuery(new Term("bogus", "te.*")), BooleanClause.Occur.SHOULD);
+ topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", bq, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a test.", snippets[0]);
+ assertEquals("Test a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a FuzzyQuery ("tets") on "body", with and without the extra
+ * prefix argument, then checks that a fuzzy clause on a different field
+ * ("bogus") leaves the "body" snippets unmarked. */
+ public void testOneFuzzy() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ Query query = new FuzzyQuery(new Term("body", "tets"), 1);
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // with prefix (third constructor argument, 2); the same highlights are expected
+ query = new FuzzyQuery(new Term("body", "tets"), 1, 2);
+ topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // wrong field: the fuzzy clause targets "bogus"; MatchAllDocsQuery keeps both docs as hits
+ BooleanQuery bq = new BooleanQuery();
+ bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ bq.add(new FuzzyQuery(new Term("bogus", "tets"), 1), BooleanClause.Occur.SHOULD);
+ topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", bq, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a test.", snippets[0]);
+ assertEquals("Test a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights TermRangeQuery matches on "body" across bound variations:
+ * both bounds set, a null lower or upper bound, exact bounds that are
+ * inclusive or exclusive, and a range on a different field ("bogus"). */
+ public void testRanges() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ Query query = TermRangeQuery.newStringRange("body", "ta", "tf", true, true);
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // null start (no lower bound)
+ query = TermRangeQuery.newStringRange("body", null, "tf", true, true);
+ topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This <b>is</b> <b>a</b> <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> <b>a</b> <b>one</b> <b>sentence</b> <b>document</b>.", snippets[1]);
+
+ // null end (no upper bound)
+ query = TermRangeQuery.newStringRange("body", "ta", null, true, true);
+ topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("<b>This</b> is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // exact start inclusive
+ query = TermRangeQuery.newStringRange("body", "test", "tf", true, true);
+ topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // exact end inclusive
+ query = TermRangeQuery.newStringRange("body", "ta", "test", true, true);
+ topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // exact start exclusive: no highlights expected; MatchAllDocsQuery keeps both docs as hits
+ BooleanQuery bq = new BooleanQuery();
+ bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ bq.add(TermRangeQuery.newStringRange("body", "test", "tf", false, true), BooleanClause.Occur.SHOULD);
+ topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", bq, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a test.", snippets[0]);
+ assertEquals("Test a one sentence document.", snippets[1]);
+
+ // exact end exclusive: no highlights expected; MatchAllDocsQuery keeps both docs as hits
+ bq = new BooleanQuery();
+ bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ bq.add(TermRangeQuery.newStringRange("body", "ta", "test", true, false), BooleanClause.Occur.SHOULD);
+ topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", bq, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a test.", snippets[0]);
+ assertEquals("Test a one sentence document.", snippets[1]);
+
+ // wrong field: the range clause targets "bogus", so "body" snippets stay unmarked
+ bq = new BooleanQuery();
+ bq.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ bq.add(TermRangeQuery.newStringRange("bogus", "ta", "tf", true, true), BooleanClause.Occur.SHOULD);
+ topDocs = searcher.search(bq, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", bq, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a test.", snippets[0]);
+ assertEquals("Test a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a WildcardQuery ("te*") nested in a BooleanQuery as a SHOULD
+ * clause, then checks that a MUST_NOT wildcard clause yields unmarked snippets. */
+ public void testWildcardInBoolean() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ BooleanQuery query = new BooleanQuery();
+ query.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD);
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ // must not: the wildcard sits under MUST_NOT (and on field "bogus"); snippets are expected unmarked
+ query = new BooleanQuery();
+ query.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
+ query.add(new WildcardQuery(new Term("bogus", "te*")), BooleanClause.Occur.MUST_NOT);
+ topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ snippets = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a test.", snippets[0]);
+ assertEquals("Test a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a WildcardQuery ("te*") that is the only disjunct of a DisjunctionMaxQuery. */
+ public void testWildcardInDisjunctionMax() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ DisjunctionMaxQuery query = new DisjunctionMaxQuery(0);
+ query.add(new WildcardQuery(new Term("body", "te*")));
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a WildcardQuery ("te*") wrapped in a SpanMultiTermQueryWrapper. */
+ public void testSpanWildcard() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ Query query = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a span-wrapped WildcardQuery ("te*") that is the only clause of a SpanOrQuery. */
+ public void testSpanOr() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ SpanQuery childQuery = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
+ Query query = new SpanOrQuery(new SpanQuery[] { childQuery });
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a span-wrapped WildcardQuery ("te*") that is the only clause of a SpanNearQuery. */
+ public void testSpanNear() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ SpanQuery childQuery = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
+ Query query = new SpanNearQuery(new SpanQuery[] { childQuery }, 0, true);
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a span-wrapped WildcardQuery ("te*") used as the include side
+ * of a SpanNotQuery; the exclude side is a SpanTermQuery for "bogus". */
+ public void testSpanNot() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ SpanQuery include = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
+ SpanQuery exclude = new SpanTermQuery(new Term("body", "bogus"));
+ Query query = new SpanNotQuery(include, exclude);
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Highlights a span-wrapped WildcardQuery ("te*") nested in a SpanFirstQuery
+ * whose end argument (1000000) is large enough that both documents still match. */
+ public void testSpanPositionCheck() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test.");
+ iw.addDocument(doc);
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ SpanQuery childQuery = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
+ Query query = new SpanFirstQuery(childQuery, 1000000);
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(2, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(2, snippets.length);
+ assertEquals("This is a <b>test</b>.", snippets[0]);
+ assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
+
+ ir.close();
+ dir.close();
+ }
+
+ /** Runs a BooleanQuery with three WildcardQuery clauses and confirms the
+ * formatter can tell which query matched which hit. */
+ public void testWhichMTQMatched() throws Exception {
+ Directory dir = newDirectory();
+ // use MockTokenizer.SIMPLE for more natural tokenization (else "test." is a token)
+ final Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("Test a one sentence document.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ // hand the highlighter the same analyzer that was used for indexing
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+ };
+ BooleanQuery query = new BooleanQuery();
+ query.add(new WildcardQuery(new Term("body", "te*")), BooleanClause.Occur.SHOULD);
+ query.add(new WildcardQuery(new Term("body", "one")), BooleanClause.Occur.SHOULD);
+ query.add(new WildcardQuery(new Term("body", "se*")), BooleanClause.Occur.SHOULD);
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(1, topDocs.totalHits);
+ String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(1, snippets.length);
+
+ // Default formatter just bolds each hit:
+ assertEquals("<b>Test</b> a <b>one</b> <b>sentence</b> document.", snippets[0]);
+
+ // Now use our own formatter, that also stuffs the
+ // matching term's text into the result:
+ highlighter = new PostingsHighlighter() {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ return analyzer;
+ }
+
+ @Override
+ protected PassageFormatter getFormatter(String field) {
+ return new PassageFormatter() {
+
+ @Override
+ public Object format(Passage passages[], String content) {
+ // Copied from DefaultPassageFormatter, but
+ // tweaked to include the matched term:
+ StringBuilder sb = new StringBuilder();
+ int pos = 0;
+ for (Passage passage : passages) {
+ // don't add ellipsis if it's the first one, or if it's connected.
+ if (passage.startOffset > pos && pos > 0) {
+ sb.append("... ");
+ }
+ pos = passage.startOffset;
+ for (int i = 0; i < passage.numMatches; i++) {
+ int start = passage.matchStarts[i];
+ int end = passage.matchEnds[i];
+ // it's possible to have overlapping terms
+ if (start > pos) {
+ sb.append(content, pos, start);
+ }
+ if (end > pos) {
+ sb.append("<b>");
+ sb.append(content, Math.max(pos, start), end);
+ sb.append('(');
+ sb.append(passage.getMatchTerms()[i].utf8ToString());
+ sb.append(')');
+ sb.append("</b>");
+ pos = end;
+ }
+ }
+ // it's possible a "term" from the analyzer could span a sentence boundary.
+ sb.append(content, pos, Math.max(pos, passage.endOffset));
+ pos = passage.endOffset;
+ }
+ return sb.toString();
+ }
+ };
+ }
+ };
+
+ assertEquals(1, topDocs.totalHits);
+ snippets = highlighter.highlight("body", query, searcher, topDocs);
+ assertEquals(1, snippets.length);
+
+ // Custom formatter appends the matching query's text (e.g. "body:te*") inside each bolded hit:
+ assertEquals("<b>Test(body:te*)</b> a <b>one(body:one)</b> <b>sentence(body:se*)</b> document.", snippets[0]);
+
+ ir.close();
+ dir.close();
+ }
+}
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java?rev=1561451&r1=1561450&r2=1561451&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java Sun Jan 26 04:49:18 2014
@@ -24,6 +24,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.postingshighlight.DefaultPassageFormatter;
@@ -68,6 +69,7 @@ import org.apache.solr.util.plugin.Plugi
* <str name="hl.bs.type">SENTENCE</str>
* <int name="hl.maxAnalyzedChars">10000</int>
* <str name="hl.multiValuedSeparatorChar"> </str>
+ * <bool name="hl.highlightMultiTerm">false</bool>
* </lst>
* </requestHandler>
* </pre>
@@ -98,6 +100,7 @@ import org.apache.solr.util.plugin.Plugi
* <li>hl.bs.variant (string) specifies country code for BreakIterator. default is empty string (root locale)
* <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
* <li>hl.multiValuedSeparatorChar specifies the logical separator between values for multi-valued fields.
+ * <li>hl.highlightMultiTerm enables highlighting for range/wildcard/fuzzy/prefix queries.
* NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
* </ul>
*
@@ -132,6 +135,8 @@ public class PostingsSolrHighlighter ext
maxPassages[i] = params.getFieldInt(fieldNames[i], HighlightParams.SNIPPETS, 1);
}
+ final IndexSchema schema = req.getSchema();
+
PostingsHighlighter highlighter = new PostingsHighlighter(maxLength) {
@Override
protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
@@ -178,6 +183,15 @@ public class PostingsSolrHighlighter ext
}
return sep.charAt(0);
}
+
+ // Returns the schema's analyzer only when the per-field HIGHLIGHT_MULTI_TERM
+ // parameter is true (it defaults to false here); otherwise returns null.
+ // NOTE(review): null presumably tells PostingsHighlighter to skip multi-term highlighting - confirm.
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) {
+ if (params.getFieldBool(field, HighlightParams.HIGHLIGHT_MULTI_TERM, false)) {
+ return schema.getAnalyzer();
+ } else {
+ return null;
+ }
+ }
};
Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);
Modified: lucene/dev/trunk/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java?rev=1561451&r1=1561450&r2=1561451&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java (original)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java Sun Jan 26 04:49:18 2014
@@ -155,4 +155,12 @@ public class TestPostingsSolrHighlighter
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.encoder", "html"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first <i>sentence</i>.'");
}
+
+ /** Runs the wildcard query text:doc*ment with hl.highlightMultiTerm=true and
+ * expects highlighted snippets for documents 101 and 102. */
+ public void testWildcard() {
+ assertQ("simplest test",
+ req("q", "text:doc*ment", "sort", "id asc", "hl", "true", "hl.highlightMultiTerm", "true"),
+ "count(//lst[@name='highlighting']/*)=2",
+ "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='<em>document</em> one'",
+ "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second <em>document</em>'");
+ }
}