You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mv...@apache.org on 2012/07/16 21:24:31 UTC
svn commit: r1362222 - in /lucene/dev/branches/branch_4x/lucene: ./
highlighter/src/java/org/apache/lucene/search/vectorhighlight/
highlighter/src/test/org/apache/lucene/search/vectorhighlight/
Author: mvg
Date: Mon Jul 16 19:24:31 2012
New Revision: 1362222
URL: http://svn.apache.org/viewvc?rev=1362222&view=rev
Log:
LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to
respect field boundaries in the case of multivalued fields.
Modified:
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java
lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1362222&r1=1362221&r2=1362222&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Jul 16 19:24:31 2012
@@ -17,6 +17,9 @@ New features
create automata from a fixed collection of UTF-8 encoded BytesRef
(Dawid Weiss, Robert Muir)
+* LUCENE-4153: Added option to fast vector highlighting via BaseFragmentsBuilder to
+ respect field boundaries in the case of multivalued fields. (Martijn van Groningen)
+
API Changes
* LUCENE-4138: update of morfologik (Polish morphological analyzer) to 1.5.3.
Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java?rev=1362222&r1=1362221&r2=1362222&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java Mon Jul 16 19:24:31 2012
@@ -17,10 +17,6 @@ package org.apache.lucene.search.vectorh
* limitations under the License.
*/
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
@@ -29,10 +25,19 @@ import org.apache.lucene.index.IndexRead
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Encoder;
-import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
+import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
protected String[] preTags, postTags;
@@ -48,6 +53,7 @@ public abstract class BaseFragmentsBuild
public static final String[] COLORED_POST_TAGS = { "</b>" };
private char multiValuedSeparator = ' ';
private final BoundaryScanner boundaryScanner;
+ private boolean discreteMultiValueHighlighting = false;
protected BaseFragmentsBuilder(){
this( new String[]{ "<b>" }, new String[]{ "</b>" } );
@@ -76,7 +82,7 @@ public abstract class BaseFragmentsBuild
public abstract List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src );
private static final Encoder NULL_ENCODER = new DefaultEncoder();
-
+
public String createFragment( IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList ) throws IOException {
return createFragment( reader, docId, fieldName, fieldFragList,
@@ -102,14 +108,23 @@ public abstract class BaseFragmentsBuild
public String[] createFragments( IndexReader reader, int docId,
String fieldName, FieldFragList fieldFragList, int maxNumFragments,
String[] preTags, String[] postTags, Encoder encoder ) throws IOException {
- if( maxNumFragments < 0 )
+
+ if( maxNumFragments < 0 ) {
throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." );
+ }
- List<WeightedFragInfo> fragInfos = getWeightedFragInfoList( fieldFragList.getFragInfos() );
-
+ List<WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
List<String> fragments = new ArrayList<String>( maxNumFragments );
Field[] values = getFields( reader, docId, fieldName );
- if( values.length == 0 ) return null;
+ if( values.length == 0 ) {
+ return null;
+ }
+
+ if (discreteMultiValueHighlighting && values.length > 1) {
+ fragInfos = discreteMultiValueHighlighting(fragInfos, values);
+ }
+
+ fragInfos = getWeightedFragInfoList(fragInfos);
StringBuilder buffer = new StringBuilder();
int[] nextValueIndex = { 0 };
for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){
@@ -186,7 +201,92 @@ public abstract class BaseFragmentsBuild
int eo = buffer.length() < endOffset ? buffer.length() : endOffset;
return buffer.substring( startOffset, eo );
}
-
+
+ protected List<WeightedFragInfo> discreteMultiValueHighlighting(List<WeightedFragInfo> fragInfos, Field[] fields) {
+ Map<String, List<WeightedFragInfo>> fieldNameToFragInfos = new HashMap<String, List<WeightedFragInfo>>();
+ for (Field field : fields) {
+ fieldNameToFragInfos.put(field.name(), new ArrayList<WeightedFragInfo>());
+ }
+
+ fragInfos: for (WeightedFragInfo fragInfo : fragInfos) {
+ int fieldStart;
+ int fieldEnd = 0;
+ for (Field field : fields) {
+ if (field.stringValue().isEmpty()) {
+ fieldEnd++;
+ continue;
+ }
+ fieldStart = fieldEnd;
+ fieldEnd += field.stringValue().length() + 1; // + 1 for going to next field with same name.
+
+ if (fragInfo.getStartOffset() >= fieldStart && fragInfo.getEndOffset() >= fieldStart &&
+ fragInfo.getStartOffset() <= fieldEnd && fragInfo.getEndOffset() <= fieldEnd) {
+ fieldNameToFragInfos.get(field.name()).add(fragInfo);
+ continue fragInfos;
+ }
+
+ if (fragInfo.getSubInfos().isEmpty()) {
+ continue fragInfos;
+ }
+
+ Toffs firstToffs = fragInfo.getSubInfos().get(0).getTermsOffsets().get(0);
+ if (fragInfo.getStartOffset() >= fieldEnd || firstToffs.getStartOffset() >= fieldEnd) {
+ continue;
+ }
+
+ int fragStart = fieldStart;
+ if (fragInfo.getStartOffset() > fieldStart && fragInfo.getStartOffset() < fieldEnd) {
+ fragStart = fragInfo.getStartOffset();
+ }
+
+ int fragEnd = fieldEnd;
+ if (fragInfo.getEndOffset() > fieldStart && fragInfo.getEndOffset() < fieldEnd) {
+ fragEnd = fragInfo.getEndOffset();
+ }
+
+
+ List<SubInfo> subInfos = new ArrayList<SubInfo>();
+ WeightedFragInfo weightedFragInfo = new WeightedFragInfo(fragStart, fragEnd, subInfos, fragInfo.getTotalBoost());
+
+ Iterator<SubInfo> subInfoIterator = fragInfo.getSubInfos().iterator();
+ while (subInfoIterator.hasNext()) {
+ SubInfo subInfo = subInfoIterator.next();
+ List<Toffs> toffsList = new ArrayList<Toffs>();
+ Iterator<Toffs> toffsIterator = subInfo.getTermsOffsets().iterator();
+ while (toffsIterator.hasNext()) {
+ Toffs toffs = toffsIterator.next();
+ if (toffs.getStartOffset() >= fieldStart && toffs.getEndOffset() <= fieldEnd) {
+ toffsList.add(toffs);
+ toffsIterator.remove();
+ }
+ }
+ if (!toffsList.isEmpty()) {
+ subInfos.add(new SubInfo(subInfo.getText(), toffsList, subInfo.getSeqnum()));
+ }
+
+ if (subInfo.getTermsOffsets().isEmpty()) {
+ subInfoIterator.remove();
+ }
+ }
+ fieldNameToFragInfos.get(field.name()).add(weightedFragInfo);
+ }
+ }
+
+ List<WeightedFragInfo> result = new ArrayList<WeightedFragInfo>();
+ for (List<WeightedFragInfo> weightedFragInfos : fieldNameToFragInfos.values()) {
+ result.addAll(weightedFragInfos);
+ }
+ Collections.sort(result, new Comparator<WeightedFragInfo>() {
+
+ public int compare(FieldFragList.WeightedFragInfo info1, FieldFragList.WeightedFragInfo info2) {
+ return info1.getStartOffset() - info2.getStartOffset();
+ }
+
+ });
+
+ return result;
+ }
+
public void setMultiValuedSeparator( char separator ){
multiValuedSeparator = separator;
}
@@ -195,6 +295,14 @@ public abstract class BaseFragmentsBuild
return multiValuedSeparator;
}
+ public boolean isDiscreteMultiValueHighlighting() {
+ return discreteMultiValueHighlighting;
+ }
+
+ public void setDiscreteMultiValueHighlighting(boolean discreteMultiValueHighlighting) {
+ this.discreteMultiValueHighlighting = discreteMultiValueHighlighting;
+ }
+
protected String getPreTag( int num ){
return getPreTag( preTags, num );
}
Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java?rev=1362222&r1=1362221&r2=1362222&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java Mon Jul 16 19:24:31 2012
@@ -17,12 +17,12 @@ package org.apache.lucene.search.vectorh
* limitations under the License.
*/
-import java.util.ArrayList;
-import java.util.List;
-
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs;
+import java.util.ArrayList;
+import java.util.List;
+
/**
* FieldFragList has a list of "frag info" that is used by FragmentsBuilder class
* to create fragments (snippets).
@@ -116,7 +116,11 @@ public abstract class FieldFragList {
public int getSeqnum(){
return seqnum;
}
-
+
+ public String getText(){
+ return text;
+ }
+
@Override
public String toString(){
StringBuilder sb = new StringBuilder();
Modified: lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java?rev=1362222&r1=1362221&r2=1362222&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java Mon Jul 16 19:24:31 2012
@@ -17,20 +17,32 @@ package org.apache.lucene.search.vectorh
* limitations under the License.
*/
+import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util._TestUtil;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
public class SimpleFragmentsBuilderTest extends AbstractTestCase {
@@ -175,4 +187,152 @@ public class SimpleFragmentsBuilderTest
sfb.setMultiValuedSeparator( '/' );
assertEquals( "//a b c//<b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
}
+
+ public void testDiscreteMultiValueHighlighting() throws Exception {
+ makeIndexShortMV();
+
+ FieldQuery fq = new FieldQuery( tq( "d" ), true, true );
+ FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
+ FieldPhraseList fpl = new FieldPhraseList( stack, fq );
+ SimpleFragListBuilder sflb = new SimpleFragListBuilder();
+ FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
+ SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
+ sfb.setDiscreteMultiValueHighlighting(true);
+ assertEquals( "<b>d</b> e", sfb.createFragment( reader, 0, F, ffl ) );
+
+ make1dmfIndex("some text to highlight", "highlight other text");
+ fq = new FieldQuery( tq( "text" ), true, true );
+ stack = new FieldTermStack( reader, 0, F, fq );
+ fpl = new FieldPhraseList( stack, fq );
+ sflb = new SimpleFragListBuilder();
+ ffl = sflb.createFieldFragList( fpl, 32 );
+ String[] result = sfb.createFragments(reader, 0, F, ffl, 3);
+ assertEquals(2, result.length);
+ assertEquals("some <b>text</b> to highlight", result[0]);
+ assertEquals("other <b>text</b>", result[1]);
+
+ fq = new FieldQuery( tq( "highlight" ), true, true );
+ stack = new FieldTermStack( reader, 0, F, fq );
+ fpl = new FieldPhraseList( stack, fq );
+ sflb = new SimpleFragListBuilder();
+ ffl = sflb.createFieldFragList( fpl, 32 );
+ result = sfb.createFragments(reader, 0, F, ffl, 3);
+ assertEquals(2, result.length);
+ assertEquals("text to <b>highlight</b>", result[0]);
+ assertEquals("<b>highlight</b> other text", result[1]);
+ }
+
+ public void testRandomDiscreteMultiValueHighlighting() throws Exception {
+ String[] randomValues = new String[3 + random().nextInt(10 * RANDOM_MULTIPLIER)];
+ for (int i = 0; i < randomValues.length; i++) {
+ String randomValue;
+ do {
+ randomValue = _TestUtil.randomSimpleString(random());
+ } while ("".equals(randomValue));
+ randomValues[i] = randomValue;
+ }
+
+ Directory dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(
+ random(),
+ dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT,
+ new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
+
+ FieldType customType = new FieldType(TextField.TYPE_STORED);
+ customType.setStoreTermVectors(true);
+ customType.setStoreTermVectorOffsets(true);
+ customType.setStoreTermVectorPositions(true);
+
+ int numDocs = randomValues.length * 5;
+ int numFields = 2 + random().nextInt(5);
+ int numTerms = 2 + random().nextInt(3);
+ List<Doc> docs = new ArrayList<Doc>(numDocs);
+ List<Document> documents = new ArrayList<Document>(numDocs);
+ Map<String, Set<Integer>> valueToDocId = new HashMap<String, Set<Integer>>();
+ for (int i = 0; i < numDocs; i++) {
+ Document document = new Document();
+ String[][] fields = new String[numFields][numTerms];
+ for (int j = 0; j < numFields; j++) {
+ String[] fieldValues = new String[numTerms];
+ fieldValues[0] = getRandomValue(randomValues, valueToDocId, i);
+ StringBuilder builder = new StringBuilder(fieldValues[0]);
+ for (int k = 1; k < numTerms; k++) {
+ fieldValues[k] = getRandomValue(randomValues, valueToDocId, i);
+ builder.append(' ').append(fieldValues[k]);
+ }
+ document.add(new Field(F, builder.toString(), customType));
+ fields[j] = fieldValues;
+ }
+ docs.add(new Doc(fields));
+ documents.add(document);
+ }
+ writer.addDocuments(documents);
+ writer.close();
+ IndexReader reader = DirectoryReader.open(dir);
+
+ try {
+ int highlightIters = 1 + random().nextInt(120 * RANDOM_MULTIPLIER);
+ for (int highlightIter = 0; highlightIter < highlightIters; highlightIter++) {
+ String queryTerm = randomValues[random().nextInt(randomValues.length)];
+ int randomHit = valueToDocId.get(queryTerm).iterator().next();
+ List<StringBuilder> builders = new ArrayList<StringBuilder>();
+ for (String[] fieldValues : docs.get(randomHit).fieldValues) {
+ StringBuilder builder = new StringBuilder();
+ boolean hit = false;
+ for (int i = 0; i < fieldValues.length; i++) {
+ if (queryTerm.equals(fieldValues[i])) {
+ builder.append("<b>").append(queryTerm).append("</b>");
+ hit = true;
+ } else {
+ builder.append(fieldValues[i]);
+ }
+ if (i != fieldValues.length - 1) {
+ builder.append(' ');
+ }
+ }
+ if (hit) {
+ builders.add(builder);
+ }
+ }
+
+ FieldQuery fq = new FieldQuery(tq(queryTerm), true, true);
+ FieldTermStack stack = new FieldTermStack(reader, randomHit, F, fq);
+
+ FieldPhraseList fpl = new FieldPhraseList(stack, fq);
+ SimpleFragListBuilder sflb = new SimpleFragListBuilder(100);
+ FieldFragList ffl = sflb.createFieldFragList(fpl, 300);
+
+ SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
+ sfb.setDiscreteMultiValueHighlighting(true);
+ String[] actualFragments = sfb.createFragments(reader, randomHit, F, ffl, numFields);
+ assertEquals(builders.size(), actualFragments.length);
+ for (int i = 0; i < actualFragments.length; i++) {
+ assertEquals(builders.get(i).toString(), actualFragments[i]);
+ }
+ }
+ } finally {
+ reader.close();
+ dir.close();
+ }
+ }
+
+ private String getRandomValue(String[] randomValues, Map<String, Set<Integer>> valueToDocId, int docId) {
+ String value = randomValues[random().nextInt(randomValues.length)];
+ if (!valueToDocId.containsKey(value)) {
+ valueToDocId.put(value, new HashSet<Integer>());
+ }
+ valueToDocId.get(value).add(docId);
+ return value;
+ }
+
+ private static class Doc {
+
+ final String[][] fieldValues;
+
+ private Doc(String[][] fieldValues) {
+ this.fieldValues = fieldValues;
+ }
+ }
+
}