You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2013/10/21 19:13:23 UTC
svn commit: r1534281 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/index/ core/src/java/org/apache/lucene/util/
core/src/test/org/apache/lucene/index/ core/src/test/org/apache/lucene/util/
highlighter/src/java/org/apache/lucene/sear...
Author: jpountz
Date: Mon Oct 21 17:13:23 2013
New Revision: 1534281
URL: http://svn.apache.org/r1534281
Log:
LUCENE-5274: FastVectorHighlighter now supports highlighting against several indexed fields
Added:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/MergedIterator.java (with props)
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestMergedIterator.java (with props)
Removed:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/MergedIterator.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/MultiFields.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Mon Oct 21 17:13:23 2013
@@ -117,6 +117,9 @@ New Features
compute each suggestion's weight using a javascript expression.
(Areek Zillur via Mike McCandless)
+* LUCENE-5274: FastVectorHighlighter now supports highlighting against several
+ indexed fields. (Nik Everett via Adrien Grand)
+
Bug Fixes
* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CoalescedDeletes.java Mon Oct 21 17:13:23 2013
@@ -25,6 +25,7 @@ import java.util.Map;
import org.apache.lucene.search.Query;
import org.apache.lucene.index.BufferedDeletesStream.QueryAndLimit;
+import org.apache.lucene.util.MergedIterator;
class CoalescedDeletes {
final Map<Query,Integer> queries = new HashMap<Query,Integer>();
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/MultiFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/MultiFields.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/MultiFields.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/MultiFields.java Mon Oct 21 17:13:23 2013
@@ -28,6 +28,7 @@ import java.util.concurrent.ConcurrentHa
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.MergedIterator;
/**
* Exposes flex API, merged from flex API of sub-segments.
Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/MergedIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/MergedIterator.java?rev=1534281&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/MergedIterator.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/MergedIterator.java Mon Oct 21 17:13:23 2013
@@ -0,0 +1,160 @@
+package org.apache.lucene.util;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Provides a merged sorted view from several sorted iterators.
+ * <p>
+ * If built with <code>removeDuplicates</code> set to true and an element
+ * appears in multiple iterators then it is deduplicated, that is this iterator
+ * returns the sorted union of elements.
+ * <p>
+ * If built with <code>removeDuplicates</code> set to false then all elements
+ * in all iterators are returned.
+ * <p>
+ * Caveats:
+ * <ul>
+ * <li>The behavior is undefined if the iterators are not actually sorted.
+ * <li>Null elements are unsupported.
+ * <li>If removeDuplicates is set to true and if a single iterator contains
+ * duplicates then they will not be deduplicated.
+ * <li>When elements are deduplicated it is not defined which one is returned.
+ * <li>If removeDuplicates is set to false then the order in which duplicates
+ * are returned isn't defined.
+ * </ul>
+ * @lucene.internal
+ */
+public final class MergedIterator<T extends Comparable<T>> implements Iterator<T> {
+ // Element handed out by the most recent successful next(); set to null when the queue is empty.
+ private T current;
+ // Sub-iterators that currently hold an unreturned element, ordered by that element (ties by index).
+ private final TermMergeQueue<T> queue;
+ // Sub-iterators popped by the last pullTop(); pushTop() advances them and re-queues them.
+ private final SubIterator<T>[] top;
+ private final boolean removeDuplicates;
+ // Number of valid entries at the start of top.
+ private int numTop;
+
+ /**
+ * Creates a merged view that removes duplicates across iterators
+ * (delegates with <code>removeDuplicates</code> set to true).
+ */
+ @SuppressWarnings({"unchecked","rawtypes"})
+ public MergedIterator(Iterator<T>... iterators) {
+ this(true, iterators);
+ }
+
+ /**
+ * Creates a merged view over the given iterators.
+ *
+ * @param removeDuplicates whether equal elements coming from different iterators are returned only once
+ * @param iterators the sorted iterators to merge; iterators that are already exhausted are ignored
+ */
+ @SuppressWarnings({"unchecked","rawtypes"})
+ public MergedIterator(boolean removeDuplicates, Iterator<T>... iterators) {
+ this.removeDuplicates = removeDuplicates;
+ queue = new TermMergeQueue<T>(iterators.length);
+ top = new SubIterator[iterators.length];
+ int index = 0;
+ for (Iterator<T> iterator : iterators) {
+ // prime each non-empty iterator with its first element so the queue can order it
+ if (iterator.hasNext()) {
+ SubIterator<T> sub = new SubIterator<T>();
+ sub.current = iterator.next();
+ sub.iterator = iterator;
+ sub.index = index++;
+ queue.add(sub);
+ }
+ }
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (queue.size() > 0) {
+ return true;
+ }
+
+ // the queue is empty, but the subs popped by the last next() have not been advanced yet
+ for (int i = 0; i < numTop; i++) {
+ if (top[i].iterator.hasNext()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Returns the next element in sorted order.
+ *
+ * @throws NoSuchElementException if no sub-iterator has an element left
+ */
+ @Override
+ public T next() {
+ // restore queue
+ pushTop();
+
+ // gather equal top elements
+ if (queue.size() > 0) {
+ pullTop();
+ } else {
+ current = null;
+ }
+ if (current == null) {
+ throw new NoSuchElementException();
+ }
+ return current;
+ }
+
+ /** Removal is not supported; always throws UnsupportedOperationException. */
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ // Pops the smallest sub-iterator (plus, when deduplicating, every other sub whose
+ // head equals it) into top and records the element to return in current.
+ private void pullTop() {
+ assert numTop == 0;
+ top[numTop++] = queue.pop();
+ if (removeDuplicates) {
+ // extract all subs from the queue that have the same top element
+ while (queue.size() != 0
+ && queue.top().current.equals(top[0].current)) {
+ top[numTop++] = queue.pop();
+ }
+ }
+ current = top[0].current;
+ }
+
+ // Advances every sub-iterator held in top and re-queues those that still have an element.
+ private void pushTop() {
+ // call next() on each top, and put back into queue
+ for (int i = 0; i < numTop; i++) {
+ if (top[i].iterator.hasNext()) {
+ top[i].current = top[i].iterator.next();
+ queue.add(top[i]);
+ } else {
+ // no more elements
+ top[i].current = null;
+ }
+ }
+ numTop = 0;
+ }
+
+ // One source iterator together with its current (not yet returned) head element.
+ private static class SubIterator<I extends Comparable<I>> {
+ Iterator<I> iterator;
+ I current;
+ // position assigned in the constructor, counting only non-empty iterators; used as tie-breaker
+ int index;
+ }
+
+ // Priority queue ordering sub-iterators by their head element, then by index.
+ private static class TermMergeQueue<C extends Comparable<C>> extends PriorityQueue<SubIterator<C>> {
+ TermMergeQueue(int size) {
+ super(size);
+ }
+
+ @Override
+ protected boolean lessThan(SubIterator<C> a, SubIterator<C> b) {
+ final int cmp = a.current.compareTo(b.current);
+ if (cmp != 0) {
+ return cmp < 0;
+ } else {
+ // equal heads: the sub-iterator with the lower index sorts first
+ return a.index < b.index;
+ }
+ }
+ }
+}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/index/TestPrefixCodedTerms.java Mon Oct 21 17:13:23 2013
@@ -25,6 +25,7 @@ import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.MergedIterator;
import org.apache.lucene.util._TestUtil;
public class TestPrefixCodedTerms extends LuceneTestCase {
@@ -66,15 +67,6 @@ public class TestPrefixCodedTerms extend
}
assertFalse(expected.hasNext());
}
-
- @SuppressWarnings("unchecked")
- public void testMergeEmpty() {
- Iterator<Term> merged = new MergedIterator<Term>();
- assertFalse(merged.hasNext());
-
- merged = new MergedIterator<Term>(new PrefixCodedTerms.Builder().finish().iterator(), new PrefixCodedTerms.Builder().finish().iterator());
- assertFalse(merged.hasNext());
- }
@SuppressWarnings("unchecked")
public void testMergeOne() {
Added: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestMergedIterator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestMergedIterator.java?rev=1534281&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestMergedIterator.java (added)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestMergedIterator.java Mon Oct 21 17:13:23 2013
@@ -0,0 +1,154 @@
+package org.apache.lucene.util;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import com.carrotsearch.randomizedtesting.annotations.Repeat;
+
+public class TestMergedIterator extends LuceneTestCase {
+ // Number of times each randomized test method is repeated.
+ private static final int REPEATS = 2;
+ // Upper bound used to size the value range in testCase (divided by the per-value multiplicities).
+ private static final int VALS_TO_MERGE = 15000;
+
+ // Merging no iterators, one empty iterator, or many empty iterators yields an empty iterator.
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ public void testMergeEmpty() {
+ Iterator<Integer> merged = new MergedIterator<Integer>();
+ assertFalse(merged.hasNext());
+
+ merged = new MergedIterator<Integer>(new ArrayList<Integer>().iterator());
+ assertFalse(merged.hasNext());
+
+ Iterator<Integer>[] itrs = new Iterator[random().nextInt(100)];
+ for (int i = 0; i < itrs.length; i++) {
+ itrs[i] = new ArrayList<Integer>().iterator();
+ }
+ merged = new MergedIterator<Integer>( itrs );
+ assertFalse(merged.hasNext());
+ }
+
+ // The test methods below call testCase(itrsWithVal, specifiedValsOnItr, removeDups);
+ // see testCase for the meaning of each argument.
+ @Repeat(iterations = REPEATS)
+ public void testNoDupsRemoveDups() {
+ testCase(1, 1, true);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testOffItrDupsRemoveDups() {
+ testCase(3, 1, true);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testOnItrDupsRemoveDups() {
+ testCase(1, 3, true);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testOnItrRandomDupsRemoveDups() {
+ testCase(1, -3, true);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testBothDupsRemoveDups() {
+ testCase(3, 3, true);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testBothDupsWithRandomDupsRemoveDups() {
+ testCase(3, -3, true);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testNoDupsKeepDups() {
+ testCase(1, 1, false);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testOffItrDupsKeepDups() {
+ testCase(3, 1, false);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testOnItrDupsKeepDups() {
+ testCase(1, 3, false);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testOnItrRandomDupsKeepDups() {
+ testCase(1, -3, false);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testBothDupsKeepDups() {
+ testCase(3, 3, false);
+ }
+
+ @Repeat(iterations = REPEATS)
+ public void testBothDupsWithRandomDupsKeepDups() {
+ testCase(3, -3, false);
+ }
+
+ // Builds random sorted lists and checks that MergedIterator returns the expected sequence.
+ // itrsWithVal: number of distinct lists each value is added to.
+ // specifiedValsOnItr: copies of the value added to each of those lists; a negative
+ // number means a random count between 1 and its absolute value.
+ // removeDups: passed to MergedIterator; also decides how many copies are expected.
+ private void testCase(int itrsWithVal, int specifiedValsOnItr, boolean removeDups) {
+ // Build a random number of lists
+ List<Integer> expected = new ArrayList<Integer>();
+ Random random = new Random(random().nextLong());
+ int numLists = itrsWithVal + random.nextInt(1000 - itrsWithVal);
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ List<Integer>[] lists = new List[numLists];
+ for (int i = 0; i < numLists; i++) {
+ lists[i] = new ArrayList<Integer>();
+ }
+ int start = random.nextInt(1000000);
+ int end = start + VALS_TO_MERGE / itrsWithVal / Math.abs(specifiedValsOnItr);
+ for (int i = start; i < end; i++) {
+ int maxList = lists.length;
+ int maxValsOnItr = 0;
+ int sumValsOnItr = 0;
+ for (int itrWithVal = 0; itrWithVal < itrsWithVal; itrWithVal++) {
+ int list = random.nextInt(maxList);
+ int valsOnItr = specifiedValsOnItr < 0 ? (1 + random.nextInt(-specifiedValsOnItr)) : specifiedValsOnItr;
+ maxValsOnItr = Math.max(maxValsOnItr, valsOnItr);
+ sumValsOnItr += valsOnItr;
+ for (int valOnItr = 0; valOnItr < valsOnItr; valOnItr++) {
+ lists[list].add(i);
+ }
+ // move the chosen list past the shrinking range so it is not picked again for this value
+ maxList = maxList - 1;
+ ArrayUtil.swap(lists, list, maxList);
+ }
+ // with deduplication the longest run within a single list survives; otherwise every copy does
+ int maxCount = removeDups ? maxValsOnItr : sumValsOnItr;
+ for (int count = 0; count < maxCount; count++) {
+ expected.add(i);
+ }
+ }
+ // Now check that they get merged cleanly
+ @SuppressWarnings({"rawtypes", "unchecked"})
+ Iterator<Integer>[] itrs = new Iterator[numLists];
+ for (int i = 0; i < numLists; i++) {
+ itrs[i] = lists[i].iterator();
+ }
+
+ MergedIterator<Integer> mergedItr = new MergedIterator<Integer>(removeDups, itrs);
+ Iterator<Integer> expectedItr = expected.iterator();
+ while (expectedItr.hasNext()) {
+ assertTrue(mergedItr.hasNext());
+ assertEquals(expectedItr.next(), mergedItr.next());
+ }
+ assertFalse(mergedItr.hasNext());
+ }
+}
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java Mon Oct 21 17:13:23 2013
@@ -18,6 +18,8 @@ package org.apache.lucene.search.vectorh
*/
import java.io.IOException;
+import java.util.Iterator;
+import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
@@ -28,7 +30,6 @@ import org.apache.lucene.search.highligh
*
*/
public class FastVectorHighlighter {
-
public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true;
public static final boolean DEFAULT_FIELD_MATCH = true;
private final boolean phraseHighlight;
@@ -186,16 +187,71 @@ public class FastVectorHighlighter {
return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments,
preTags, postTags, encoder );
}
-
+
+ /**
+ * Return the best fragments. Matches are scanned from matchedFields and turned into fragments against
+ * storedField. The highlighting may not make sense if matchedFields has matches with offsets that don't
+ * correspond to features in storedField. It will outright throw a {@code StringIndexOutOfBoundsException}
+ * if matchedFields produces offsets outside of storedField. As such it is advisable that all
+ * matchedFields share the same source as storedField or are at least a prefix of it.
+ *
+ * @param fieldQuery {@link FieldQuery} object
+ * @param reader {@link IndexReader} of the index
+ * @param docId document id to be highlighted
+ * @param storedField field of the document that stores the text
+ * @param matchedFields fields of the document to scan for matches
+ * @param fragCharSize the length (number of chars) of a fragment
+ * @param maxNumFragments maximum number of fragments
+ * @param fragListBuilder {@link FragListBuilder} object
+ * @param fragmentsBuilder {@link FragmentsBuilder} object
+ * @param preTags pre-tags to be used to highlight terms
+ * @param postTags post-tags to be used to highlight terms
+ * @param encoder an encoder that generates encoded text
+ * @return created fragments or null when no fragments created.
+ * size of the array can be less than maxNumFragments
+ * @throws IOException If there is a low-level I/O error
+ */
+ public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId,
+ String storedField, Set< String > matchedFields, int fragCharSize, int maxNumFragments,
+ FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder,
+ String[] preTags, String[] postTags, Encoder encoder ) throws IOException {
+ // the frag list is built from the matched fields, the fragment text comes from storedField
+ FieldFragList fieldFragList =
+ getFieldFragList( fragListBuilder, fieldQuery, reader, docId, matchedFields, fragCharSize );
+ return fragmentsBuilder.createFragments( reader, docId, storedField, fieldFragList, maxNumFragments,
+ preTags, postTags, encoder );
+ }
+
+ /**
+ * Build a FieldFragList for one field.
+ */
private FieldFragList getFieldFragList( FragListBuilder fragListBuilder,
final FieldQuery fieldQuery, IndexReader reader, int docId,
- String fieldName, int fragCharSize ) throws IOException {
- FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery );
+ String matchedField, int fragCharSize ) throws IOException {
+ FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, matchedField, fieldQuery );
FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery, phraseLimit );
return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize );
}
/**
+ * Build a FieldFragList for more than one field.
+ */
+ private FieldFragList getFieldFragList( FragListBuilder fragListBuilder,
+ final FieldQuery fieldQuery, IndexReader reader, int docId,
+ Set< String > matchedFields, int fragCharSize ) throws IOException {
+ Iterator< String > matchedFieldsItr = matchedFields.iterator();
+ // reject an empty set up front: there would be no field to scan for matches
+ if ( !matchedFieldsItr.hasNext() ) {
+ throw new IllegalArgumentException( "matchedFields must contain at least one field name." );
+ }
+ // build one FieldPhraseList per matched field, then merge them into a single list
+ FieldPhraseList[] toMerge = new FieldPhraseList[ matchedFields.size() ];
+ int i = 0;
+ while ( matchedFieldsItr.hasNext() ) {
+ FieldTermStack stack = new FieldTermStack( reader, docId, matchedFieldsItr.next(), fieldQuery );
+ toMerge[ i++ ] = new FieldPhraseList( stack, fieldQuery, phraseLimit );
+ }
+ return fragListBuilder.createFieldFragList( new FieldPhraseList( toMerge ), fragCharSize );
+ }
+
+ /**
* return whether phraseHighlight or not.
*
* @return whether phraseHighlight or not
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java Mon Oct 21 17:13:23 2013
@@ -17,18 +17,23 @@ package org.apache.lucene.search.vectorh
*/
import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+import org.apache.lucene.util.MergedIterator;
/**
* FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
* to create a FieldFragList object.
*/
public class FieldPhraseList {
-
+ /**
+ * List of non-overlapping WeightedPhraseInfo objects.
+ */
LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<WeightedPhraseInfo>();
/**
@@ -106,6 +111,55 @@ public class FieldPhraseList {
}
}
+ /**
+ * Merging constructor. Walks the WeightedPhraseInfos of all given lists in
+ * sorted order and collapses every run of infos whose offsets overlap or touch
+ * into a single merged WeightedPhraseInfo; infos that overlap nothing are
+ * added unchanged.
+ *
+ * @param toMerge FieldPhraseLists to merge to build this one
+ */
+ public FieldPhraseList( FieldPhraseList[] toMerge ) {
+ // Merge all overlapping WeightedPhraseInfos
+ // Step 1. Sort by startOffset, endOffset, and boost, in that order.
+ @SuppressWarnings( { "rawtypes", "unchecked" } )
+ Iterator< WeightedPhraseInfo >[] allInfos = new Iterator[ toMerge.length ];
+ int index = 0;
+ for ( FieldPhraseList fplToMerge : toMerge ) {
+ allInfos[ index++ ] = fplToMerge.phraseList.iterator();
+ }
+ // removeDuplicates is false: equal infos from different lists must all be seen so they can be merged
+ MergedIterator< WeightedPhraseInfo > itr = new MergedIterator< WeightedPhraseInfo >( false, allInfos );
+ // Step 2. Walk the sorted list merging infos that overlap
+ phraseList = new LinkedList< WeightedPhraseInfo >();
+ if ( !itr.hasNext() ) {
+ return;
+ }
+ // work holds the current run of overlapping infos; workEndOffset is the largest end offset in it
+ List< WeightedPhraseInfo > work = new ArrayList< WeightedPhraseInfo >();
+ WeightedPhraseInfo first = itr.next();
+ work.add( first );
+ int workEndOffset = first.getEndOffset();
+ while ( itr.hasNext() ) {
+ WeightedPhraseInfo current = itr.next();
+ if ( current.getStartOffset() <= workEndOffset ) {
+ workEndOffset = Math.max( workEndOffset, current.getEndOffset() );
+ work.add( current );
+ } else {
+ // current starts after the run ends: flush the run and start a new one with current
+ if ( work.size() == 1 ) {
+ phraseList.add( work.get( 0 ) );
+ work.set( 0, current );
+ } else {
+ phraseList.add( new WeightedPhraseInfo( work ) );
+ work.clear();
+ work.add( current );
+ }
+ workEndOffset = current.getEndOffset();
+ }
+ }
+ // flush the final run
+ if ( work.size() == 1 ) {
+ phraseList.add( work.get( 0 ) );
+ } else {
+ phraseList.add( new WeightedPhraseInfo( work ) );
+ work.clear();
+ }
+ }
+
public void addIfNoOverlap( WeightedPhraseInfo wpi ){
for( WeightedPhraseInfo existWpi : getPhraseList() ){
if( existWpi.isOffsetOverlap( wpi ) ) {
@@ -121,9 +175,7 @@ public class FieldPhraseList {
/**
* Represents the list of term offsets and boost for some text
*/
- public static class WeightedPhraseInfo {
-
- private String text; // unnecessary member, just exists for debugging purpose
+ public static class WeightedPhraseInfo implements Comparable< WeightedPhraseInfo > {
private List<Toffs> termsOffsets; // usually termsOffsets.size() == 1,
// but if position-gap > 1 and slop > 0 then size() could be greater than 1
private float boost; // query boost
@@ -132,10 +184,15 @@ public class FieldPhraseList {
private ArrayList<TermInfo> termsInfos;
/**
+ * Text of the match, calculated on the fly. Use for debugging only.
* @return the text
*/
public String getText() {
- return text;
+ StringBuilder text = new StringBuilder();
+ for ( TermInfo ti: termsInfos ) {
+ text.append( ti.getText() );
+ }
+ return text.toString();
}
/**
@@ -174,15 +231,11 @@ public class FieldPhraseList {
TermInfo ti = terms.get( 0 );
termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
if( terms.size() == 1 ){
- text = ti.getText();
return;
}
- StringBuilder sb = new StringBuilder();
- sb.append( ti.getText() );
int pos = ti.getPosition();
for( int i = 1; i < terms.size(); i++ ){
ti = terms.get( i );
- sb.append( ti.getText() );
if( ti.getPosition() - pos == 1 ){
Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
to.setEndOffset( ti.getEndOffset() );
@@ -192,7 +245,50 @@ public class FieldPhraseList {
}
pos = ti.getPosition();
}
- text = sb.toString();
+ }
+
+ /**
+ * Merging constructor. Sums the boosts, collects the TermInfos and merges the
+ * overlapping term offsets of all given infos. Note that this just grabs seqnum
+ * from the first info.
+ *
+ * @param toMerge infos to merge; must contain at least one element
+ * @throws IllegalArgumentException if toMerge is empty
+ */
+ public WeightedPhraseInfo( Collection< WeightedPhraseInfo > toMerge ) {
+ // Pretty much the same idea as merging FieldPhraseLists:
+ // Step 1. Sort by startOffset, endOffset
+ // While we are here merge the boosts and termInfos
+ Iterator< WeightedPhraseInfo > toMergeItr = toMerge.iterator();
+ if ( !toMergeItr.hasNext() ) {
+ throw new IllegalArgumentException( "toMerge must contain at least one WeightedPhraseInfo." );
+ }
+ WeightedPhraseInfo first = toMergeItr.next();
+ @SuppressWarnings( { "rawtypes", "unchecked" } )
+ Iterator< Toffs >[] allToffs = new Iterator[ toMerge.size() ];
+ // seed with the first info's TermInfos so they are merged like its boost and offsets
+ termsInfos = new ArrayList< TermInfo >( first.termsInfos );
+ seqnum = first.seqnum;
+ boost = first.boost;
+ allToffs[ 0 ] = first.termsOffsets.iterator();
+ int index = 1;
+ while ( toMergeItr.hasNext() ) {
+ WeightedPhraseInfo info = toMergeItr.next();
+ boost += info.boost;
+ termsInfos.addAll( info.termsInfos );
+ allToffs[ index++ ] = info.termsOffsets.iterator();
+ }
+ // Step 2. Walk the sorted list merging overlaps
+ MergedIterator< Toffs > itr = new MergedIterator< Toffs >( false, allToffs );
+ termsOffsets = new ArrayList< Toffs >();
+ if ( !itr.hasNext() ) {
+ return;
+ }
+ // NOTE: work is a Toffs instance taken from one of the merged infos, so widening
+ // its endOffset below also changes that source info's offsets.
+ Toffs work = itr.next();
+ while ( itr.hasNext() ) {
+ Toffs current = itr.next();
+ if ( current.startOffset <= work.endOffset ) {
+ work.endOffset = Math.max( work.endOffset, current.endOffset );
+ } else {
+ termsOffsets.add( work );
+ work = current;
+ }
+ }
+ termsOffsets.add( work );
}
public int getStartOffset(){
@@ -202,7 +298,7 @@ public class FieldPhraseList {
public int getEndOffset(){
return termsOffsets.get( termsOffsets.size() - 1 ).endOffset;
}
-
+
public boolean isOffsetOverlap( WeightedPhraseInfo other ){
int so = getStartOffset();
int eo = getEndOffset();
@@ -218,7 +314,7 @@ public class FieldPhraseList {
@Override
public String toString(){
StringBuilder sb = new StringBuilder();
- sb.append( text ).append( '(' ).append( boost ).append( ")(" );
+ sb.append( getText() ).append( '(' ).append( boost ).append( ")(" );
for( Toffs to : termsOffsets ){
sb.append( to );
}
@@ -233,10 +329,58 @@ public class FieldPhraseList {
return seqnum;
}
+ // Orders by start offset, then end offset, then boost.
+ // NOTE(review): the int subtractions assume offsets small enough not to overflow — confirm.
+ @Override
+ public int compareTo( WeightedPhraseInfo other ) {
+ int diff = getStartOffset() - other.getStartOffset();
+ if ( diff != 0 ) {
+ return diff;
+ }
+ diff = getEndOffset() - other.getEndOffset();
+ if ( diff != 0 ) {
+ return diff;
+ }
+ // signum reduces the float boost difference to -1, 0 or 1
+ return (int) Math.signum( getBoost() - other.getBoost() );
+ }
+
+ // Hashes the same three values that equals() compares: start offset, end offset and boost.
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + getStartOffset();
+ result = prime * result + getEndOffset();
+ // fold the 64-bit representation of the boost into 32 bits
+ long b = Double.doubleToLongBits( getBoost() );
+ result = prime * result + ( int )( b ^ ( b >>> 32 ) );
+ return result;
+ }
+
+ // Two infos are equal when they have the same class, start offset, end offset and boost;
+ // seqnum and the term infos are not compared.
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ WeightedPhraseInfo other = (WeightedPhraseInfo) obj;
+ if (getStartOffset() != other.getStartOffset()) {
+ return false;
+ }
+ if (getEndOffset() != other.getEndOffset()) {
+ return false;
+ }
+ if (getBoost() != other.getBoost()) {
+ return false;
+ }
+ return true;
+ }
+
/**
* Term offsets (start + end)
*/
- public static class Toffs {
+ public static class Toffs implements Comparable< Toffs > {
private int startOffset;
private int endOffset;
public Toffs( int startOffset, int endOffset ){
@@ -253,6 +397,42 @@ public class FieldPhraseList {
return endOffset;
}
@Override
+ // Orders by start offset, then end offset.
+ // NOTE(review): the int subtractions assume offsets small enough not to overflow — confirm.
+ public int compareTo( Toffs other ) {
+ int diff = getStartOffset() - other.getStartOffset();
+ if ( diff != 0 ) {
+ return diff;
+ }
+ return getEndOffset() - other.getEndOffset();
+ }
+ // Hashes the same two values that equals() compares: start offset and end offset.
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + getStartOffset();
+ result = prime * result + getEndOffset();
+ return result;
+ }
+ // Two Toffs are equal when they have the same class, start offset and end offset.
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ Toffs other = (Toffs) obj;
+ if (getStartOffset() != other.getStartOffset()) {
+ return false;
+ }
+ if (getEndOffset() != other.getEndOffset()) {
+ return false;
+ }
+ return true;
+ }
+ @Override
public String toString(){
StringBuilder sb = new StringBuilder();
sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' );
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java Mon Oct 21 17:13:23 2013
@@ -161,7 +161,8 @@ public class FieldTermStack {
}
/**
- * Single term with its position/offsets in the document and IDF weight
+ * Single term with its position/offsets in the document and IDF weight.
+ * It is Comparable but considers only position.
*/
public static class TermInfo implements Comparable<TermInfo>{
@@ -198,5 +199,30 @@ public class FieldTermStack {
public int compareTo( TermInfo o ){
return ( this.position - o.position );
}
+ // Hashes only the position, the single value that equals() compares.
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + position;
+ return result;
+ }
+
+ // Two TermInfos are equal when they have the same class and position;
+ // no other field is compared.
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ TermInfo other = (TermInfo) obj;
+ if (position != other.position) {
+ return false;
+ }
+ return true;
+ }
}
}
Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java Mon Oct 21 17:13:23 2013
@@ -16,10 +16,18 @@ package org.apache.lucene.search.vectorh
* limitations under the License.
*/
import java.io.IOException;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@@ -33,10 +41,15 @@ import org.apache.lucene.search.BooleanC
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.highlight.DefaultEncoder;
+import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
public class FastVectorHighlighterTest extends LuceneTestCase {
@@ -287,4 +300,222 @@ public class FastVectorHighlighterTest e
writer.close();
dir.close();
}
+
+ /**
+ * Exercises highlighting of the stored field "field" when the query also (or only)
+ * targets other fields built from the same text. Each call below indexes one
+ * document and checks the single best fragment; see matchedFieldsTestCase for the
+ * fields that are indexed and how each one is analyzed. Calls without the two
+ * leading booleans use matched fields and require a field match.
+ */
+ public void testMatchedFields() throws IOException {
+ // Querying only the stored field does not highlight the stop word "a" (matched fields are off here)
+ matchedFieldsTestCase( false, true, "a match", "a <b>match</b>",
+ clause( "field", "a" ), clause( "field", "match" ) );
+
+ // It still is not highlighted when matched fields are enabled but none of them is queried
+ matchedFieldsTestCase( "a match", "a <b>match</b>",
+ clause( "field", "a" ), clause( "field", "match" ) );
+
+ // Nor is it highlighted when field_exact is queried but no matched fields are given to the highlighter
+ matchedFieldsTestCase( false, false, "a match", "a <b>match</b>",
+ clause( "field_exact", "a" ), clause( "field", "match" ) );
+
+ // But when field_exact is both queried and passed as a matched field, the stop word is highlighted
+ matchedFieldsTestCase( "a match", "<b>a</b> <b>match</b>",
+ clause( "field_exact", "a" ), clause( "field", "match" ) );
+
+ // Querying only the matched field still produces highlighting on the stored field's text
+ matchedFieldsTestCase( "a match", "<b>a</b> <b>match</b>",
+ clause( "field_exact", "a" ), clause( "field_exact", "match" ) );
+
+ // Boosted matched fields work too
+ matchedFieldsTestCase( "a match", "<b>a</b> <b>match</b>",
+ clause( "field_exact", 5, "a" ), clause( "field", "match" ) );
+
+ // It is also fine if both the stored and the matched field match the same term
+ matchedFieldsTestCase( "a match", "a <b>match</b>",
+ clause( "field_exact", "match" ), clause( "field", "match" ) );
+
+ // The highlighter respects the boosts on matched fields when sorting fragments: with boost 5
+ // the "a cat" phrase fragment wins, with the default boost the fragment with two "cat" hits wins
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk",
+ clause( "field", "cat" ), clause( "field_exact", "a", "cat" ) );
+
+ // The same thing works across three fields as well
+ matchedFieldsTestCase( "cat cat CAT junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 200, "a", "cat" ), clause( "field_super_exact", 5, "CAT" ) );
+ matchedFieldsTestCase( "a cat cat junk junk junk junk junk junk junk a CAT junk junk",
+ "junk junk <b>a CAT</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ), clause( "field_super_exact", 200, "a", "CAT" ) );
+
+ // And across fields with different tokenizers!
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field_exact", 5, "a", "cat" ), clause( "field_characters", "c" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>c</b>at <b>c</b>at junk junk junk junk",
+ clause( "field_exact", "a", "cat" ), clause( "field_characters", "c" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "ca<b>t</b> ca<b>t</b> junk junk junk junk",
+ clause( "field_exact", "a", "cat" ), clause( "field_characters", "t" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk", // Overlapping hits ("c" and "cat") are joined into one highlighted span
+ clause( "field", "cat" ), clause( "field_characters", 5, "c" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_characters", 5, "a", " ", "c", "a", "t" ) );
+
+ // Overlapping phrases and tokens are joined into a single highlighted span
+ matchedFieldsTestCase( "cats wow", "<b>cats w</b>ow",
+ clause( "field", "cats" ), clause( "field_tripples", "s w" ) );
+
+ // Everything works pretty well even if you don't require a field match (fieldMatch = false)
+ matchedFieldsTestCase( true, false, "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_characters", 10, "a", " ", "c", "a", "t" ) );
+
+ // Even boosts keep themselves pretty much intact
+ matchedFieldsTestCase( true, false, "a cat cat junk junk junk junk junk junk junk a CAT junk junk",
+ "junk junk <b>a CAT</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ), clause( "field_super_exact", 200, "a", "CAT" ) );
+ matchedFieldsTestCase( true, false, "cat cat CAT junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 200, "a", "cat" ), clause( "field_super_exact", 5, "CAT" ) );
+
+ // Except that all the matched field matches apply even if they aren't mentioned in the query,
+ // which can make for some confusing scoring. This isn't too big a deal, just something you
+ // need to think about when you don't force a field match.
+ matchedFieldsTestCase( true, false, "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk",
+ clause( "field", "cat" ), clause( "field_characters", 4, "a", " ", "c", "a", "t" ) );
+
+ // It is also fine to match fields that don't have _exactly_ the same text so long as you are careful.
+ // In this case field_sliced is a prefix of field.
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk", clause( "field_sliced", "cat" ) );
+
+ // Multiple matches add to the score of the segment
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk",
+ clause( "field", "cat" ), clause( "field_sliced", "cat" ), clause( "field_exact", 2, "a", "cat" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_sliced", "cat" ), clause( "field_exact", 4, "a", "cat" ) );
+
+ // Even fields with tokens stacked on top of one another (field_der_red) are ok
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> cat junk junk junk junk",
+ clause( "field_der_red", 2, "der" ), clause( "field_exact", "a", "cat" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> cat junk junk junk junk",
+ clause( "field_der_red", 2, "red" ), clause( "field_exact", "a", "cat" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> cat junk junk junk junk",
+ clause( "field_der_red", "red" ), clause( "field_der_red", "der" ), clause( "field_exact", "a", "cat" ) );
+ }
+
+  /**
+   * Convenience overload: runs the case with matched fields passed to the
+   * highlighter and with field matching turned on.
+   */
+  private void matchedFieldsTestCase( String fieldValue, String expected, Query... queryClauses ) throws IOException {
+    final boolean useMatchedFields = true;
+    final boolean fieldMatch = true;
+    matchedFieldsTestCase( useMatchedFields, fieldMatch, fieldValue, expected, queryClauses );
+  }
+
+  /**
+   * Indexes a single document that carries {@code fieldValue} in the stored field
+   * "field" and in several unstored, differently analyzed variants, highlights
+   * "field" for the conjunction of {@code queryClauses} and asserts that the best
+   * fragment equals {@code expected}.
+   *
+   * @param useMatchedFields whether the full set of indexed field names is passed to the highlighter as matched fields
+   * @param fieldMatch       forwarded to the {@link FieldQuery} constructor
+   * @param fieldValue       text indexed into the fields
+   * @param expected         expected content of the first returned fragment
+   * @param queryClauses     clauses that are all added to a BooleanQuery as MUST
+   */
+  private void matchedFieldsTestCase( boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses ) throws IOException {
+    // Both field types record term vectors with positions and offsets; only the first one is stored.
+    FieldType storedType = new FieldType( TextField.TYPE_STORED );
+    storedType.setStoreTermVectors( true );
+    storedType.setStoreTermVectorPositions( true );
+    storedType.setStoreTermVectorOffsets( true );
+    storedType.freeze();
+    FieldType matchedType = new FieldType( TextField.TYPE_NOT_STORED );
+    matchedType.setStoreTermVectors( true );
+    matchedType.setStoreTermVectorPositions( true );
+    matchedType.setStoreTermVectorOffsets( true );
+    matchedType.freeze();
+
+    Document document = new Document();
+    document.add( new Field( "field", fieldValue, storedType ) ); // Whitespace tokenized with English stop words
+    document.add( new Field( "field_exact", fieldValue, matchedType ) ); // Whitespace tokenized without stop words
+    document.add( new Field( "field_super_exact", fieldValue, matchedType ) ); // Whitespace tokenized without toLower
+    document.add( new Field( "field_characters", fieldValue, matchedType ) ); // Each letter is a token
+    document.add( new Field( "field_tripples", fieldValue, matchedType ) ); // Every three letters is a token
+    int sliceEnd = Math.min( fieldValue.length() - 1, 10 );
+    String slicedValue = fieldValue.substring( 0, sliceEnd );
+    document.add( new Field( "field_sliced", slicedValue, matchedType ) ); // Prefix of at most 10 chars, analyzed just like field
+    Token der = token( "der", 1, 0, 3 );
+    Token red = token( "red", 0, 0, 3 );
+    document.add( new Field( "field_der_red", new CannedTokenStream( der, red ), matchedType ) ); // Hacky field containing "der" and "red" at pos = 0
+
+    // Per-field analyzers; "field_sliced" and "field_der_red" share the analyzer of "field".
+    final Map<String, Analyzer> analyzersByField = new TreeMap<String, Analyzer>();
+    Analyzer stopWordAnalyzer = new MockAnalyzer( random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET );
+    analyzersByField.put( "field", stopWordAnalyzer );
+    analyzersByField.put( "field_exact", new MockAnalyzer( random() ) );
+    analyzersByField.put( "field_super_exact", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, false ) );
+    analyzersByField.put( "field_characters", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp(".").toAutomaton() ), true ) );
+    analyzersByField.put( "field_tripples", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp("...").toAutomaton() ), true ) );
+    analyzersByField.put( "field_sliced", stopWordAnalyzer );
+    analyzersByField.put( "field_der_red", stopWordAnalyzer ); // This is required even though we provide a token stream
+    Analyzer perFieldAnalyzer = new AnalyzerWrapper() {
+      public Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzersByField.get( fieldName );
+      }
+    };
+
+    Directory directory = newDirectory();
+    IndexWriter indexWriter = new IndexWriter( directory, newIndexWriterConfig( TEST_VERSION_CURRENT, perFieldAnalyzer ) );
+    indexWriter.addDocument( document );
+    IndexReader indexReader = DirectoryReader.open( indexWriter, true );
+
+    // Every clause is required.
+    BooleanQuery conjunction = new BooleanQuery();
+    for ( int i = 0; i < queryClauses.length; i++ ) {
+      conjunction.add( queryClauses[ i ], Occur.MUST );
+    }
+    FieldQuery fieldQuery = new FieldQuery( conjunction, indexReader, true, fieldMatch );
+
+    FastVectorHighlighter highlighter = new FastVectorHighlighter();
+    FragListBuilder fragListBuilder = new SimpleFragListBuilder();
+    FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
+    Encoder encoder = new DefaultEncoder();
+    String[] preTags = { "<b>" };
+    String[] postTags = { "</b>" };
+    final int docId = 0;
+    final String[] bestFragments;
+    if ( !useMatchedFields ) {
+      bestFragments = highlighter.getBestFragments( fieldQuery, indexReader, docId, "field", 25, 1,
+          fragListBuilder, fragmentsBuilder, preTags, postTags, encoder );
+    } else {
+      // Offer every indexed field to the highlighter as a source of matches.
+      String[] matchedFieldNames = {
+          "field", "field_exact", "field_super_exact", "field_characters",
+          "field_tripples", "field_sliced", "field_der_red" };
+      Set< String > matchedFields = new HashSet< String >();
+      for ( String name : matchedFieldNames ) {
+        matchedFields.add( name );
+      }
+      bestFragments = highlighter.getBestFragments( fieldQuery, indexReader, docId, "field", matchedFields, 25, 1,
+          fragListBuilder, fragmentsBuilder, preTags, postTags, encoder );
+    }
+    assertEquals( expected, bestFragments[ 0 ] );
+
+    indexReader.close();
+    indexWriter.close();
+    directory.close();
+  }
+
+  /** Builds a clause on {@code field} for {@code terms} with a boost of 1. */
+  private Query clause( String field, String... terms ) {
+    final float defaultBoost = 1;
+    return clause( field, defaultBoost, terms );
+  }
+
+  /**
+   * Builds a boosted clause on {@code field}: a TermQuery when exactly one term is
+   * given, otherwise a PhraseQuery made of the terms in the given order.
+   */
+  private Query clause( String field, float boost, String... terms ) {
+    final Query result;
+    if ( terms.length != 1 ) {
+      PhraseQuery phrase = new PhraseQuery();
+      for ( int i = 0; i < terms.length; i++ ) {
+        phrase.add( new Term( field, terms[ i ] ) );
+      }
+      result = phrase;
+    } else {
+      result = new TermQuery( new Term( field, terms[ 0 ] ) );
+    }
+    result.setBoost( boost );
+    return result;
+  }
+
+  /** Creates a Token for {@code term} with the given offsets and position increment. */
+  private static Token token( String term, int posInc, int startOffset, int endOffset ) {
+    final Token result = new Token( term, startOffset, endOffset );
+    result.setPositionIncrement( posInc );
+    return result;
+  }
}
Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java Mon Oct 21 17:13:23 2013
@@ -16,8 +16,14 @@ package org.apache.lucene.search.vectorh
* limitations under the License.
*/
+import java.util.LinkedList;
+
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
+import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+import org.apache.lucene.util._TestUtil;
public class FieldPhraseListTest extends AbstractTestCase {
@@ -188,7 +194,7 @@ public class FieldPhraseListTest extends
assertEquals( 1, fpl.phraseList.size() );
assertEquals( "sppeeeed(1.0)((88,93))", fpl.phraseList.get( 0 ).toString() );
}
-
+
/* This test shows a big speedup from limiting the number of analyzed phrases in
* this bad case for FieldPhraseList */
/* But it is not reliable as a unit test since it is timing-dependent
@@ -218,4 +224,68 @@ public class FieldPhraseListTest extends
assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() );
}
*/
+
+  /**
+   * Checks that equals, hashCode and compareTo of WeightedPhraseInfo agree with one
+   * another for infos that differ by offsets and by boost.
+   */
+  public void testWeightedPhraseInfoComparisonConsistency() {
+    WeightedPhraseInfo first = newInfo( 0, 0, 1 );
+    WeightedPhraseInfo second = newInfo( 1, 2, 1 );
+    WeightedPhraseInfo third = newInfo( 2, 3, 1 );
+    WeightedPhraseInfo firstTwin = newInfo( 0, 0, 1 );
+    WeightedPhraseInfo firstBoosted = newInfo( 0, 0, 2 );
+
+    // Every info is consistent with itself.
+    WeightedPhraseInfo[] all = { first, second, third, firstTwin, firstBoosted };
+    for ( WeightedPhraseInfo info : all ) {
+      assertConsistentEquals( info, info );
+    }
+    // Same offsets and boost: equal.
+    assertConsistentEquals( first, firstTwin );
+
+    // Strict ordering between the distinct infos.
+    assertConsistentLessThan( first, second );
+    assertConsistentLessThan( second, third );
+    assertConsistentLessThan( first, third );
+    assertConsistentLessThan( first, firstBoosted );
+    assertConsistentLessThan( firstBoosted, second );
+    assertConsistentLessThan( firstBoosted, third );
+    assertConsistentLessThan( firstTwin, second );
+    assertConsistentLessThan( firstTwin, third );
+    assertConsistentLessThan( firstTwin, firstBoosted );
+  }
+
+  /**
+   * Checks that equals, hashCode and compareTo of Toffs agree with one another for
+   * a few distinct and duplicate offset pairs.
+   */
+  public void testToffsComparisonConsistency() {
+    Toffs origin = new Toffs( 0, 0 );
+    Toffs middle = new Toffs( 1, 2 );
+    Toffs last = new Toffs( 2, 3 );
+    Toffs originTwin = new Toffs( 0, 0 );
+
+    // Every instance is consistent with itself.
+    Toffs[] all = { origin, middle, last, originTwin };
+    for ( Toffs toffs : all ) {
+      assertConsistentEquals( toffs, toffs );
+    }
+    // Same offsets: equal.
+    assertConsistentEquals( origin, originTwin );
+
+    // Strict ordering between the distinct offset pairs.
+    assertConsistentLessThan( origin, middle );
+    assertConsistentLessThan( middle, last );
+    assertConsistentLessThan( origin, last );
+    assertConsistentLessThan( originTwin, middle );
+    assertConsistentLessThan( originTwin, last );
+  }
+
+  /**
+   * Creates a WeightedPhraseInfo with the given boost from a single TermInfo that
+   * has random text and the given offsets.
+   */
+  private WeightedPhraseInfo newInfo( int startOffset, int endOffset, float boost ) {
+    TermInfo onlyTerm = new TermInfo( _TestUtil.randomUnicodeString( random() ), startOffset, endOffset, 0, 0 );
+    LinkedList< TermInfo > terms = new LinkedList< TermInfo >();
+    terms.add( onlyTerm );
+    return new WeightedPhraseInfo( terms, boost );
+  }
+
+  /**
+   * Asserts that {@code a} and {@code b} are equal in both directions, have the
+   * same hash code and compare as 0 in both directions.
+   */
+  private < T extends Comparable< T > > void assertConsistentEquals( T a, T b ) {
+    assertEquals( a, b );
+    assertEquals( b, a );
+    final int hashOfA = a.hashCode();
+    final int hashOfB = b.hashCode();
+    assertEquals( hashOfA, hashOfB );
+    final int forward = a.compareTo( b );
+    assertEquals( 0, forward );
+    final int backward = b.compareTo( a );
+    assertEquals( 0, backward );
+  }
+
+  /**
+   * Asserts that {@code a} and {@code b} are unequal in both directions, have
+   * different hash codes and that {@code a} sorts strictly before {@code b}.
+   */
+  private < T extends Comparable< T > > void assertConsistentLessThan( T a, T b ) {
+    assertFalse( a.equals( b ) );
+    assertFalse( b.equals( a ) );
+    assertTrue( a.hashCode() != b.hashCode() );
+    final int forward = a.compareTo( b );
+    assertTrue( forward < 0 );
+    final int backward = b.compareTo( a );
+    assertTrue( backward > 0 );
+  }
}
Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java?rev=1534281&r1=1534280&r2=1534281&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.java Mon Oct 21 17:13:23 2013
@@ -20,6 +20,8 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+import org.apache.lucene.util._TestUtil;
public class FieldTermStackTest extends AbstractTestCase {
@@ -173,4 +175,37 @@ public class FieldTermStackTest extends
assertEquals ("the(195,198,31)", stack.pop().toString());
}
+  /**
+   * Checks that equals, hashCode and compareTo of TermInfo agree with one another.
+   * The terms get random text and differ only in their fourth constructor argument.
+   */
+  public void testTermInfoComparisonConsistency() {
+    TermInfo atZero = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 0, 1 );
+    TermInfo atOne = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 1, 1 );
+    TermInfo atTwo = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 2, 1 );
+    TermInfo atZeroTwin = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 0, 1 );
+
+    // Every term is consistent with itself.
+    TermInfo[] all = { atZero, atOne, atTwo, atZeroTwin };
+    for ( TermInfo info : all ) {
+      assertConsistentEquals( info, info );
+    }
+    // Different text but otherwise identical arguments: equal.
+    assertConsistentEquals( atZero, atZeroTwin );
+
+    // Strict ordering between the distinct terms.
+    assertConsistentLessThan( atZero, atOne );
+    assertConsistentLessThan( atOne, atTwo );
+    assertConsistentLessThan( atZero, atTwo );
+    assertConsistentLessThan( atZeroTwin, atOne );
+    assertConsistentLessThan( atZeroTwin, atTwo );
+  }
+
+  /**
+   * Asserts that {@code a} and {@code b} are equal in both directions, have the
+   * same hash code and compare as 0 in both directions.
+   */
+  private < T extends Comparable< T > > void assertConsistentEquals( T a, T b ) {
+    assertEquals( a, b );
+    assertEquals( b, a );
+    final int hashOfA = a.hashCode();
+    final int hashOfB = b.hashCode();
+    assertEquals( hashOfA, hashOfB );
+    final int forward = a.compareTo( b );
+    assertEquals( 0, forward );
+    final int backward = b.compareTo( a );
+    assertEquals( 0, backward );
+  }
+
+  /**
+   * Asserts that {@code a} and {@code b} are unequal in both directions, have
+   * different hash codes and that {@code a} sorts strictly before {@code b}.
+   */
+  private < T extends Comparable< T > > void assertConsistentLessThan( T a, T b ) {
+    assertFalse( a.equals( b ) );
+    assertFalse( b.equals( a ) );
+    assertTrue( a.hashCode() != b.hashCode() );
+    final int forward = a.compareTo( b );
+    assertTrue( forward < 0 );
+    final int backward = b.compareTo( a );
+    assertTrue( backward > 0 );
+  }
}