You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/21 20:58:44 UTC
svn commit: r1534320 [20/39] - in /lucene/dev/branches/lucene4956: ./
dev-tools/ dev-tools/idea/.idea/ dev-tools/idea/lucene/expressions/
dev-tools/idea/solr/contrib/velocity/ dev-tools/maven/
dev-tools/maven/lucene/ dev-tools/maven/lucene/expressions/...
Modified: lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeAccumulator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeAccumulator.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeAccumulator.java (original)
+++ lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/range/TestRangeAccumulator.java Mon Oct 21 18:58:24 2013
@@ -73,6 +73,8 @@ public class TestRangeAccumulator extend
field.setLongValue(l);
w.addDocument(doc);
}
+ field.setLongValue(Long.MAX_VALUE);
+ w.addDocument(doc);
IndexReader r = w.getReader();
w.close();
@@ -82,7 +84,7 @@ public class TestRangeAccumulator extend
new LongRange("less than or equal to 10", 0L, true, 10L, true),
new LongRange("over 90", 90L, false, 100L, false),
new LongRange("90 or above", 90L, true, 100L, false),
- new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, false)));
+ new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, true)));
FacetsCollector fc = FacetsCollector.create(a);
@@ -90,7 +92,7 @@ public class TestRangeAccumulator extend
s.search(new MatchAllDocsQuery(), fc);
List<FacetResult> result = fc.getFacetResults();
assertEquals(1, result.size());
- assertEquals("field (0)\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n", FacetTestUtils.toSimpleString(result.get(0)));
+ assertEquals("field (0)\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (1)\n", FacetTestUtils.toSimpleString(result.get(0)));
r.close();
d.close();
@@ -632,5 +634,44 @@ public class TestRangeAccumulator extend
r.close();
dir.close();
}
-}
+ // LUCENE-5178
+ public void testMissingValues() throws Exception {
+ assumeTrue("codec does not support docsWithField", defaultCodecSupportsDocsWithField());
+ Directory d = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), d);
+ Document doc = new Document();
+ NumericDocValuesField field = new NumericDocValuesField("field", 0L);
+ doc.add(field);
+ for(long l=0;l<100;l++) {
+ if (l % 5 == 0) {
+ // Every 5th doc is missing the value:
+ w.addDocument(new Document());
+ continue;
+ }
+ field.setLongValue(l);
+ w.addDocument(doc);
+ }
+
+ IndexReader r = w.getReader();
+ w.close();
+
+ RangeAccumulator a = new RangeAccumulator(new RangeFacetRequest<LongRange>("field",
+ new LongRange("less than 10", 0L, true, 10L, false),
+ new LongRange("less than or equal to 10", 0L, true, 10L, true),
+ new LongRange("over 90", 90L, false, 100L, false),
+ new LongRange("90 or above", 90L, true, 100L, false),
+ new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, false)));
+
+ FacetsCollector fc = FacetsCollector.create(a);
+
+ IndexSearcher s = newSearcher(r);
+ s.search(new MatchAllDocsQuery(), fc);
+ List<FacetResult> result = fc.getFacetResults();
+ assertEquals(1, result.size());
+ assertEquals("field (0)\n less than 10 (8)\n less than or equal to 10 (8)\n over 90 (8)\n 90 or above (8)\n over 1000 (0)\n", FacetTestUtils.toSimpleString(result.get(0)));
+
+ r.close();
+ d.close();
+ }
+}
Modified: lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownQueryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownQueryTest.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownQueryTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownQueryTest.java Mon Oct 21 18:58:24 2013
@@ -66,6 +66,10 @@ public class DrillDownQueryTest extends
@AfterClass
public static void afterClassDrillDownQueryTest() throws Exception {
IOUtils.close(reader, taxo, dir, taxoDir);
+ reader = null;
+ taxo = null;
+ dir = null;
+ taxoDir = null;
}
@BeforeClass
Modified: lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java (original)
+++ lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/TestDemoFacets.java Mon Oct 21 18:58:24 2013
@@ -22,13 +22,16 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.FacetTestUtils;
+import org.apache.lucene.facet.codecs.facet46.Facet46Codec;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetIndexingParams;
@@ -48,6 +51,8 @@ import org.apache.lucene.search.similari
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util._TestUtil;
public class TestDemoFacets extends FacetTestCase {
@@ -248,4 +253,60 @@ public class TestDemoFacets extends Face
dir.close();
taxoDir.close();
}
+
+ // LUCENE-4583: make sure if we require > 32 KB for one
+ // document, we don't hit exc when using Facet42DocValuesFormat
+ public void testManyFacetsInOneDocument() throws Exception {
+ Directory dir = newDirectory();
+ Directory taxoDir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setCodec(new Facet46Codec());
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+ DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
+
+ FacetFields facetFields = new FacetFields(taxoWriter);
+
+ int numLabels = _TestUtil.nextInt(random(), 40000, 100000);
+
+ Document doc = new Document();
+ doc.add(newTextField("field", "text", Field.Store.NO));
+ List<CategoryPath> paths = new ArrayList<CategoryPath>();
+ for(int i=0;i<numLabels;i++) {
+ paths.add(new CategoryPath("dim", "" + i));
+ }
+ facetFields.addFields(doc, paths);
+ writer.addDocument(doc);
+
+ // NRT open
+ IndexSearcher searcher = newSearcher(writer.getReader());
+ writer.close();
+
+ // NRT open
+ TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
+ taxoWriter.close();
+
+ FacetSearchParams fsp = new FacetSearchParams(new CountFacetRequest(new CategoryPath("dim"), Integer.MAX_VALUE));
+
+ // Aggregate the facet counts:
+ FacetsCollector c = FacetsCollector.create(fsp, searcher.getIndexReader(), taxoReader);
+
+ // MatchAllDocsQuery is for "browsing" (counts facets
+ // for all non-deleted docs in the index); normally
+ // you'd use a "normal" query, and use MultiCollector to
+ // wrap collecting the "normal" hits and also facets:
+ searcher.search(new MatchAllDocsQuery(), c);
+ List<FacetResult> results = c.getFacetResults();
+ assertEquals(1, results.size());
+ FacetResultNode root = results.get(0).getFacetResultNode();
+ assertEquals(numLabels, root.subResults.size());
+ Set<String> allLabels = new HashSet<String>();
+ for(FacetResultNode childNode : root.subResults) {
+ assertEquals(2, childNode.label.length);
+ allLabels.add(childNode.label.components[1]);
+ assertEquals(1, (int) childNode.value);
+ }
+ assertEquals(numLabels, allLabels.size());
+
+ IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/TestFacetsCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/TestFacetsCollector.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/TestFacetsCollector.java (original)
+++ lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/search/TestFacetsCollector.java Mon Oct 21 18:58:24 2013
@@ -89,14 +89,8 @@ public class TestFacetsCollector extends
DirectoryReader r = DirectoryReader.open(indexDir);
DirectoryTaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir);
- FacetSearchParams sParams = new FacetSearchParams(new SumScoreFacetRequest(new CategoryPath("a"), 10));
- TaxonomyFacetsAccumulator fa = new TaxonomyFacetsAccumulator(sParams, r, taxo) {
- @Override
- public FacetsAggregator getAggregator() {
- return new SumScoreFacetsAggregator();
- }
- };
- FacetsCollector fc = FacetsCollector.create(fa);
+ FacetSearchParams fsp = new FacetSearchParams(new SumScoreFacetRequest(new CategoryPath("a"), 10));
+ FacetsCollector fc = FacetsCollector.create(fsp, r, taxo);
TopScoreDocCollector topDocs = TopScoreDocCollector.create(10, false);
ConstantScoreQuery csq = new ConstantScoreQuery(new MatchAllDocsQuery());
csq.setBoost(2.0f);
@@ -335,12 +329,7 @@ public class TestFacetsCollector extends
// assert IntFacetResultHandler
fsp = new FacetSearchParams(new SumScoreFacetRequest(new CategoryPath("a"), 10));
if (random().nextBoolean()) {
- fa = new TaxonomyFacetsAccumulator(fsp, r, taxo) {
- @Override
- public FacetsAggregator getAggregator() {
- return new SumScoreFacetsAggregator();
- }
- };
+ fa = new TaxonomyFacetsAccumulator(fsp, r, taxo);
} else {
fa = new OldFacetsAccumulator(fsp, r, taxo);
}
Modified: lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java (original)
+++ lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java Mon Oct 21 18:58:24 2013
@@ -473,6 +473,9 @@ public class TestDirectoryTaxonomyReader
int numCategories = atLeast(10);
int numA = 0, numB = 0;
Random random = random();
+ // add the two categories for which we'll also add children (so asserts are simpler)
+ taxoWriter.addCategory(new CategoryPath("a"));
+ taxoWriter.addCategory(new CategoryPath("b"));
for (int i = 0; i < numCategories; i++) {
if (random.nextBoolean()) {
taxoWriter.addCategory(new CategoryPath("a", Integer.toString(i)));
Modified: lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java Mon Oct 21 18:58:24 2013
@@ -469,4 +469,27 @@ public class TestDirectoryTaxonomyWriter
IOUtils.close(indexDir, taxoDir);
}
+
+ @Test
+ public void testReplaceTaxoWithLargeTaxonomy() throws Exception {
+ Directory srcTaxoDir = newDirectory(), targetTaxoDir = newDirectory();
+
+ // build source, large, taxonomy
+ DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(srcTaxoDir);
+ int ord = taxoWriter.addCategory(new CategoryPath("A/1/1/1/1/1/1", '/'));
+ taxoWriter.close();
+
+ taxoWriter = new DirectoryTaxonomyWriter(targetTaxoDir);
+ int ordinal = taxoWriter.addCategory(new CategoryPath("B/1", '/'));
+ assertEquals(1, taxoWriter.getParent(ordinal)); // call getParent to initialize taxoArrays
+ taxoWriter.commit();
+
+ taxoWriter.replaceTaxonomy(srcTaxoDir);
+ assertEquals(ord - 1, taxoWriter.getParent(ord));
+ taxoWriter.close();
+
+ srcTaxoDir.close();
+ targetTaxoDir.close();
+ }
+
}
Modified: lucene/dev/branches/lucene4956/lucene/grouping/src/java/org/apache/lucene/search/grouping/function/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/grouping/src/java/org/apache/lucene/search/grouping/function/package.html?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/grouping/src/java/org/apache/lucene/search/grouping/function/package.html (original)
+++ lucene/dev/branches/lucene4956/lucene/grouping/src/java/org/apache/lucene/search/grouping/function/package.html Mon Oct 21 18:58:24 2013
@@ -16,6 +16,6 @@
-->
<html>
<body>
-Support for grouping by {org.apache.lucene.queries.function.ValueSource}.
+Support for grouping by {@link org.apache.lucene.queries.function.ValueSource}.
</body>
</html>
Modified: lucene/dev/branches/lucene4956/lucene/grouping/src/java/org/apache/lucene/search/grouping/term/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/grouping/src/java/org/apache/lucene/search/grouping/term/package.html?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/grouping/src/java/org/apache/lucene/search/grouping/term/package.html (original)
+++ lucene/dev/branches/lucene4956/lucene/grouping/src/java/org/apache/lucene/search/grouping/term/package.html Mon Oct 21 18:58:24 2013
@@ -16,6 +16,6 @@
-->
<html>
<body>
-Support for grouping by indexed terms via {org.apache.lucene.search.FieldCache}.
+Support for grouping by indexed terms via {@link org.apache.lucene.search.FieldCache}.
</body>
</html>
Modified: lucene/dev/branches/lucene4956/lucene/grouping/src/test/org/apache/lucene/search/grouping/AllGroupHeadsCollectorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/grouping/src/test/org/apache/lucene/search/grouping/AllGroupHeadsCollectorTest.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/grouping/src/test/org/apache/lucene/search/grouping/AllGroupHeadsCollectorTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/grouping/src/test/org/apache/lucene/search/grouping/AllGroupHeadsCollectorTest.java Mon Oct 21 18:58:24 2013
@@ -302,7 +302,7 @@ public class AllGroupHeadsCollectorTest
w.close();
// NOTE: intentional but temporary field cache insanity!
- final FieldCache.Ints docIdToFieldId = FieldCache.DEFAULT.getInts(new SlowCompositeReaderWrapper(r), "id", false);
+ final FieldCache.Ints docIdToFieldId = FieldCache.DEFAULT.getInts(SlowCompositeReaderWrapper.wrap(r), "id", false);
final int[] fieldIdToDocID = new int[numDocs];
for (int i = 0; i < numDocs; i++) {
int fieldId = docIdToFieldId.get(i);
Modified: lucene/dev/branches/lucene4956/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/branches/lucene4956/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Mon Oct 21 18:58:24 2013
@@ -743,7 +743,7 @@ public class TestGrouping extends Lucene
w.close();
// NOTE: intentional but temporary field cache insanity!
- final FieldCache.Ints docIDToID = FieldCache.DEFAULT.getInts(new SlowCompositeReaderWrapper(r), "id", false);
+ final FieldCache.Ints docIDToID = FieldCache.DEFAULT.getInts(SlowCompositeReaderWrapper.wrap(r), "id", false);
DirectoryReader rBlocks = null;
Directory dirBlocks = null;
@@ -779,7 +779,7 @@ public class TestGrouping extends Lucene
dirBlocks = newDirectory();
rBlocks = getDocBlockReader(dirBlocks, groupDocs);
final Filter lastDocInBlock = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("groupend", "x"))));
- final FieldCache.Ints docIDToIDBlocks = FieldCache.DEFAULT.getInts(new SlowCompositeReaderWrapper(rBlocks), "id", false);
+ final FieldCache.Ints docIDToIDBlocks = FieldCache.DEFAULT.getInts(SlowCompositeReaderWrapper.wrap(rBlocks), "id", false);
final IndexSearcher sBlocks = newSearcher(rBlocks);
final ShardState shardsBlocks = new ShardState(sBlocks);
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java Mon Oct 21 18:58:24 2013
@@ -47,33 +47,27 @@ public class SimpleHTMLEncoder implement
{
char ch = plainText.charAt(index);
- switch (ch)
- {
+ switch (ch) {
case '"':
result.append("&quot;");
break;
-
case '&':
result.append("&amp;");
break;
-
case '<':
result.append("&lt;");
break;
-
case '>':
result.append("&gt;");
break;
-
+ case '\'':
+ result.append("&#x27;");
+ break;
+ case '/':
+ result.append("&#x2F;");
+ break;
default:
- if (ch < 128)
- {
- result.append(ch);
- }
- else
- {
- result.append("&#").append((int)ch).append(";");
- }
+ result.append(ch);
}
}
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java Mon Oct 21 18:58:24 2013
@@ -422,6 +422,11 @@ public class WeightedSpanTermExtractor {
public NumericDocValues getNormValues(String field) throws IOException {
return super.getNormValues(FIELD_NAME);
}
+
+ @Override
+ public Bits getDocsWithField(String field) throws IOException {
+ return super.getDocsWithField(FIELD_NAME);
+ }
}
/**
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java Mon Oct 21 18:58:24 2013
@@ -31,8 +31,11 @@ public abstract class PassageFormatter {
* @param passages top-N passages for the field. Note these are sorted in
* the order that they appear in the document for convenience.
* @param content content for the field.
- * @return formatted highlight
+ * @return formatted highlight. Note that for the
+ * non-expert APIs in {@link PostingsHighlighter} that
+ * return String, the toString method on the Object
+ * returned by this method is used to compute the string.
*/
- public abstract String format(Passage passages[], String content);
+ public abstract Object format(Passage passages[], String content);
}
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Mon Oct 21 18:58:24 2013
@@ -107,7 +107,7 @@ public class PostingsHighlighter {
private PassageScorer defaultScorer;
/**
- * Creates a new highlighter with default parameters.
+ * Creates a new highlighter with {@link #DEFAULT_MAX_LENGTH}.
*/
public PostingsHighlighter() {
this(DEFAULT_MAX_LENGTH);
@@ -267,7 +267,7 @@ public class PostingsHighlighter {
return highlightFields(fields, query, searcher, docids, maxPassages);
}
-
+
/**
* Highlights the top-N passages from multiple fields,
* for the provided int[] docids.
@@ -280,7 +280,7 @@ public class PostingsHighlighter {
* @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
- * corresponding to the documents in <code>topDocs</code>.
+ * corresponding to the documents in <code>docidsIn</code>.
* If no highlights were found for a document, the
* first {@code maxPassages} from the field will
* be returned.
@@ -289,6 +289,45 @@ public class PostingsHighlighter {
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
+ Map<String,String[]> snippets = new HashMap<String,String[]>();
+ for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
+ Object[] snippetObjects = ent.getValue();
+ String[] snippetStrings = new String[snippetObjects.length];
+ snippets.put(ent.getKey(), snippetStrings);
+ for(int i=0;i<snippetObjects.length;i++) {
+ Object snippet = snippetObjects[i];
+ if (snippet != null) {
+ snippetStrings[i] = snippet.toString();
+ }
+ }
+ }
+
+ return snippets;
+ }
+
+ /**
+ * Expert: highlights the top-N passages from multiple fields,
+ * for the provided int[] docids, to custom Object as
+ * returned by the {@link PassageFormatter}. Use
+ * this API to render to something other than String.
+ *
+ * @param fieldsIn field names to highlight.
+ * Must have a stored string value and also be indexed with offsets.
+ * @param query query to highlight.
+ * @param searcher searcher that was previously used to execute the query.
+ * @param docidsIn containing the document IDs to highlight.
+ * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
+ * form the highlighted snippets.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>docidsIn</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} from the field will
+ * be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ protected Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
if (fieldsIn.length < 1) {
throw new IllegalArgumentException("fieldsIn must not be empty");
}
@@ -335,7 +374,7 @@ public class PostingsHighlighter {
// pull stored data:
String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
- Map<String,String[]> highlights = new HashMap<String,String[]>();
+ Map<String,Object[]> highlights = new HashMap<String,Object[]>();
for (int i = 0; i < fields.length; i++) {
String field = fields[i];
int numPassages = maxPassages[i];
@@ -350,9 +389,9 @@ public class PostingsHighlighter {
for(Term term : fieldTerms) {
terms[termUpto++] = term.bytes();
}
- Map<Integer,String> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
+ Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
- String[] result = new String[docids.length];
+ Object[] result = new Object[docids.length];
for (int j = 0; j < docidsIn.length; j++) {
result[j] = fieldHighlights.get(docidsIn[j]);
}
@@ -394,8 +433,8 @@ public class PostingsHighlighter {
return ' ';
}
- private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
- Map<Integer,String> highlights = new HashMap<Integer,String>();
+ private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
+ Map<Integer,Object> highlights = new HashMap<Integer,Object>();
// reuse in the real sense... for docs in same segment we just advance our old enum
DocsAndPositionsEnum postings[] = null;
@@ -563,7 +602,7 @@ public class PostingsHighlighter {
start = dp.startOffset();
end = dp.endOffset();
}
- if (start >= current.endOffset) {
+ if (start >= current.endOffset || end > contentLength) {
pq.offer(off);
break;
}
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java Mon Oct 21 18:58:24 2013
@@ -18,6 +18,8 @@ package org.apache.lucene.search.vectorh
*/
import java.io.IOException;
+import java.util.Iterator;
+import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
@@ -28,7 +30,6 @@ import org.apache.lucene.search.highligh
*
*/
public class FastVectorHighlighter {
-
public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true;
public static final boolean DEFAULT_FIELD_MATCH = true;
private final boolean phraseHighlight;
@@ -186,16 +187,71 @@ public class FastVectorHighlighter {
return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments,
preTags, postTags, encoder );
}
-
+
+ /**
+ * Return the best fragments. Matches are scanned from matchedFields and turned into fragments against
+ * storedField. The highlighting may not make sense if matchedFields has matches with offsets that don't
+ * correspond to features in storedField. It will outright throw a {@code StringIndexOutOfBoundsException}
+ * if matchedFields produces offsets outside of storedField. As such it is advisable that all
+ * matchedFields share the same source as storedField or are at least a prefix of it.
+ *
+ * @param fieldQuery {@link FieldQuery} object
+ * @param reader {@link IndexReader} of the index
+ * @param docId document id to be highlighted
+ * @param storedField field of the document that stores the text
+ * @param matchedFields fields of the document to scan for matches
+ * @param fragCharSize the length (number of chars) of a fragment
+ * @param maxNumFragments maximum number of fragments
+ * @param fragListBuilder {@link FragListBuilder} object
+ * @param fragmentsBuilder {@link FragmentsBuilder} object
+ * @param preTags pre-tags to be used to highlight terms
+ * @param postTags post-tags to be used to highlight terms
+ * @param encoder an encoder that generates encoded text
+ * @return created fragments or null when no fragments created.
+ * size of the array can be less than maxNumFragments
+ * @throws IOException If there is a low-level I/O error
+ */
+ public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId,
+ String storedField, Set< String > matchedFields, int fragCharSize, int maxNumFragments,
+ FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder,
+ String[] preTags, String[] postTags, Encoder encoder ) throws IOException {
+ FieldFragList fieldFragList =
+ getFieldFragList( fragListBuilder, fieldQuery, reader, docId, matchedFields, fragCharSize );
+ return fragmentsBuilder.createFragments( reader, docId, storedField, fieldFragList, maxNumFragments,
+ preTags, postTags, encoder );
+ }
+
+ /**
+ * Build a FieldFragList for one field.
+ */
private FieldFragList getFieldFragList( FragListBuilder fragListBuilder,
final FieldQuery fieldQuery, IndexReader reader, int docId,
- String fieldName, int fragCharSize ) throws IOException {
- FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery );
+ String matchedField, int fragCharSize ) throws IOException {
+ FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, matchedField, fieldQuery );
FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery, phraseLimit );
return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize );
}
/**
+ * Build a FieldFragList for more than one field.
+ */
+ private FieldFragList getFieldFragList( FragListBuilder fragListBuilder,
+ final FieldQuery fieldQuery, IndexReader reader, int docId,
+ Set< String > matchedFields, int fragCharSize ) throws IOException {
+ Iterator< String > matchedFieldsItr = matchedFields.iterator();
+ if ( !matchedFieldsItr.hasNext() ) {
+ throw new IllegalArgumentException( "matchedFields must contain at least on field name." );
+ }
+ FieldPhraseList[] toMerge = new FieldPhraseList[ matchedFields.size() ];
+ int i = 0;
+ while ( matchedFieldsItr.hasNext() ) {
+ FieldTermStack stack = new FieldTermStack( reader, docId, matchedFieldsItr.next(), fieldQuery );
+ toMerge[ i++ ] = new FieldPhraseList( stack, fieldQuery, phraseLimit );
+ }
+ return fragListBuilder.createFieldFragList( new FieldPhraseList( toMerge ), fragCharSize );
+ }
+
+ /**
* return whether phraseHighlight or not.
*
* @return whether phraseHighlight or not
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java Mon Oct 21 18:58:24 2013
@@ -17,18 +17,23 @@ package org.apache.lucene.search.vectorh
*/
import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+import org.apache.lucene.util.MergedIterator;
/**
* FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
* to create a FieldFragList object.
*/
public class FieldPhraseList {
-
+ /**
+ * List of non-overlapping WeightedPhraseInfo objects.
+ */
LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<WeightedPhraseInfo>();
/**
@@ -60,46 +65,98 @@ public class FieldPhraseList {
public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery, int phraseLimit ){
final String field = fieldTermStack.getFieldName();
- QueryPhraseMap qpm = fieldQuery.getRootMap(field);
- if (qpm != null) {
- LinkedList<TermInfo> phraseCandidate = new LinkedList<TermInfo>();
- extractPhrases(fieldTermStack.termList, qpm, phraseCandidate, 0);
- assert phraseCandidate.size() == 0;
+ LinkedList<TermInfo> phraseCandidate = new LinkedList<TermInfo>();
+ QueryPhraseMap currMap = null;
+ QueryPhraseMap nextMap = null;
+ while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) )
+ {
+ phraseCandidate.clear();
+
+ TermInfo ti = fieldTermStack.pop();
+ currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
+
+ // if not found, discard top TermInfo from stack, then try next element
+ if( currMap == null ) continue;
+
+ // if found, search the longest phrase
+ phraseCandidate.add( ti );
+ while( true ){
+ ti = fieldTermStack.pop();
+ nextMap = null;
+ if( ti != null )
+ nextMap = currMap.getTermMap( ti.getText() );
+ if( ti == null || nextMap == null ){
+ if( ti != null )
+ fieldTermStack.push( ti );
+ if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
+ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
+ }
+ else{
+ while( phraseCandidate.size() > 1 ){
+ fieldTermStack.push( phraseCandidate.removeLast() );
+ currMap = fieldQuery.searchPhrase( field, phraseCandidate );
+ if( currMap != null ){
+ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
+ break;
+ }
+ }
+ }
+ break;
+ }
+ else{
+ phraseCandidate.add( ti );
+ currMap = nextMap;
+ }
+ }
}
}
- void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
- if (terms.isEmpty()) {
- if (longest > 0) {
- addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
- }
+ /**
+ * Merging constructor.
+ *
+ * @param toMerge FieldPhraseLists to merge to build this one
+ */
+ public FieldPhraseList( FieldPhraseList[] toMerge ) {
+ // Merge all overlapping WeightedPhraseInfos
+ // Step 1. Sort by startOffset, endOffset, and boost, in that order.
+ @SuppressWarnings( { "rawtypes", "unchecked" } )
+ Iterator< WeightedPhraseInfo >[] allInfos = new Iterator[ toMerge.length ];
+ int index = 0;
+ for ( FieldPhraseList fplToMerge : toMerge ) {
+ allInfos[ index++ ] = fplToMerge.phraseList.iterator();
+ }
+ MergedIterator< WeightedPhraseInfo > itr = new MergedIterator< WeightedPhraseInfo >( false, allInfos );
+ // Step 2. Walk the sorted list merging infos that overlap
+ phraseList = new LinkedList< WeightedPhraseInfo >();
+ if ( !itr.hasNext() ) {
return;
}
- ArrayList<TermInfo> samePositionTerms = new ArrayList<TermInfo>();
- do {
- samePositionTerms.add(terms.pop());
- } while (!terms.isEmpty() && terms.get(0).getPosition() == samePositionTerms.get(0).getPosition());
-
- // try all next terms at the same position
- for (TermInfo nextTerm : samePositionTerms) {
- QueryPhraseMap nextMap = currMap.getTermMap(nextTerm.getText());
- if (nextMap != null) {
- phraseCandidate.add(nextTerm);
- int l = longest;
- if(nextMap.isValidTermOrPhrase( phraseCandidate ) ){
- l = phraseCandidate.size();
- }
- extractPhrases(terms, nextMap, phraseCandidate, l);
- phraseCandidate.removeLast();
+ List< WeightedPhraseInfo > work = new ArrayList< WeightedPhraseInfo >();
+ WeightedPhraseInfo first = itr.next();
+ work.add( first );
+ int workEndOffset = first.getEndOffset();
+ while ( itr.hasNext() ) {
+ WeightedPhraseInfo current = itr.next();
+ if ( current.getStartOffset() <= workEndOffset ) {
+ workEndOffset = Math.max( workEndOffset, current.getEndOffset() );
+ work.add( current );
+ } else {
+ if ( work.size() == 1 ) {
+ phraseList.add( work.get( 0 ) );
+ work.set( 0, current );
+ } else {
+ phraseList.add( new WeightedPhraseInfo( work ) );
+ work.clear();
+ work.add( current );
+ }
+ workEndOffset = current.getEndOffset();
}
}
-
- // ignore the next term
- extractPhrases(terms, currMap, phraseCandidate, longest);
-
- // add terms back
- for (TermInfo nextTerm : samePositionTerms) {
- terms.push(nextTerm);
+ if ( work.size() == 1 ) {
+ phraseList.add( work.get( 0 ) );
+ } else {
+ phraseList.add( new WeightedPhraseInfo( work ) );
+ work.clear();
}
}
@@ -118,9 +175,7 @@ public class FieldPhraseList {
/**
* Represents the list of term offsets and boost for some text
*/
- public static class WeightedPhraseInfo {
-
- private String text; // unnecessary member, just exists for debugging purpose
+ public static class WeightedPhraseInfo implements Comparable< WeightedPhraseInfo > {
private List<Toffs> termsOffsets; // usually termsOffsets.size() == 1,
// but if position-gap > 1 and slop > 0 then size() could be greater than 1
private float boost; // query boost
@@ -129,10 +184,15 @@ public class FieldPhraseList {
private ArrayList<TermInfo> termsInfos;
/**
+ * Text of the match, calculated on the fly. Use for debugging only.
* @return the text
*/
public String getText() {
- return text;
+ StringBuilder text = new StringBuilder();
+ for ( TermInfo ti: termsInfos ) {
+ text.append( ti.getText() );
+ }
+ return text.toString();
}
/**
@@ -156,11 +216,11 @@ public class FieldPhraseList {
return termsInfos;
}
- public WeightedPhraseInfo( List<TermInfo> terms, float boost ){
+ public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost ){
this( terms, boost, 0 );
}
- public WeightedPhraseInfo( List<TermInfo> terms, float boost, int seqnum ){
+ public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int seqnum ){
this.boost = boost;
this.seqnum = seqnum;
@@ -171,15 +231,11 @@ public class FieldPhraseList {
TermInfo ti = terms.get( 0 );
termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
if( terms.size() == 1 ){
- text = ti.getText();
return;
}
- StringBuilder sb = new StringBuilder();
- sb.append( ti.getText() );
int pos = ti.getPosition();
for( int i = 1; i < terms.size(); i++ ){
ti = terms.get( i );
- sb.append( ti.getText() );
if( ti.getPosition() - pos == 1 ){
Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
to.setEndOffset( ti.getEndOffset() );
@@ -189,7 +245,50 @@ public class FieldPhraseList {
}
pos = ti.getPosition();
}
- text = sb.toString();
+ }
+
+ /**
+ * Merging constructor. Note that this just grabs seqnum from the first info.
+ */
+ public WeightedPhraseInfo( Collection< WeightedPhraseInfo > toMerge ) {
+ // Pretty much the same idea as merging FieldPhraseLists:
+ // Step 1. Sort by startOffset, endOffset
+ // While we are here merge the boosts and termInfos
+ Iterator< WeightedPhraseInfo > toMergeItr = toMerge.iterator();
+ if ( !toMergeItr.hasNext() ) {
+ throw new IllegalArgumentException( "toMerge must contain at least one WeightedPhraseInfo." );
+ }
+ WeightedPhraseInfo first = toMergeItr.next();
+ @SuppressWarnings( { "rawtypes", "unchecked" } )
+ Iterator< Toffs >[] allToffs = new Iterator[ toMerge.size() ];
+ termsInfos = new ArrayList< TermInfo >();
+ seqnum = first.seqnum;
+ boost = first.boost;
+ allToffs[ 0 ] = first.termsOffsets.iterator();
+ int index = 1;
+ while ( toMergeItr.hasNext() ) {
+ WeightedPhraseInfo info = toMergeItr.next();
+ boost += info.boost;
+ termsInfos.addAll( info.termsInfos );
+ allToffs[ index++ ] = info.termsOffsets.iterator();
+ }
+ // Step 2. Walk the sorted list merging overlaps
+ MergedIterator< Toffs > itr = new MergedIterator< Toffs >( false, allToffs );
+ termsOffsets = new ArrayList< Toffs >();
+ if ( !itr.hasNext() ) {
+ return;
+ }
+ Toffs work = itr.next();
+ while ( itr.hasNext() ) {
+ Toffs current = itr.next();
+ if ( current.startOffset <= work.endOffset ) {
+ work.endOffset = Math.max( work.endOffset, current.endOffset );
+ } else {
+ termsOffsets.add( work );
+ work = current;
+ }
+ }
+ termsOffsets.add( work );
}
public int getStartOffset(){
@@ -199,7 +298,7 @@ public class FieldPhraseList {
public int getEndOffset(){
return termsOffsets.get( termsOffsets.size() - 1 ).endOffset;
}
-
+
public boolean isOffsetOverlap( WeightedPhraseInfo other ){
int so = getStartOffset();
int eo = getEndOffset();
@@ -215,7 +314,7 @@ public class FieldPhraseList {
@Override
public String toString(){
StringBuilder sb = new StringBuilder();
- sb.append( text ).append( '(' ).append( boost ).append( ")(" );
+ sb.append( getText() ).append( '(' ).append( boost ).append( ")(" );
for( Toffs to : termsOffsets ){
sb.append( to );
}
@@ -230,10 +329,58 @@ public class FieldPhraseList {
return seqnum;
}
+ @Override
+ public int compareTo( WeightedPhraseInfo other ) {
+ int diff = getStartOffset() - other.getStartOffset();
+ if ( diff != 0 ) {
+ return diff;
+ }
+ diff = getEndOffset() - other.getEndOffset();
+ if ( diff != 0 ) {
+ return diff;
+ }
+ return (int) Math.signum( getBoost() - other.getBoost() );
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + getStartOffset();
+ result = prime * result + getEndOffset();
+ long b = Double.doubleToLongBits( getBoost() );
+ result = prime * result + ( int )( b ^ ( b >>> 32 ) );
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ WeightedPhraseInfo other = (WeightedPhraseInfo) obj;
+ if (getStartOffset() != other.getStartOffset()) {
+ return false;
+ }
+ if (getEndOffset() != other.getEndOffset()) {
+ return false;
+ }
+ if (getBoost() != other.getBoost()) {
+ return false;
+ }
+ return true;
+ }
+
/**
* Term offsets (start + end)
*/
- public static class Toffs {
+ public static class Toffs implements Comparable< Toffs > {
private int startOffset;
private int endOffset;
public Toffs( int startOffset, int endOffset ){
@@ -250,6 +397,42 @@ public class FieldPhraseList {
return endOffset;
}
@Override
+ public int compareTo( Toffs other ) {
+ int diff = getStartOffset() - other.getStartOffset();
+ if ( diff != 0 ) {
+ return diff;
+ }
+ return getEndOffset() - other.getEndOffset();
+ }
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + getStartOffset();
+ result = prime * result + getEndOffset();
+ return result;
+ }
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ Toffs other = (Toffs) obj;
+ if (getStartOffset() != other.getStartOffset()) {
+ return false;
+ }
+ if (getEndOffset() != other.getEndOffset()) {
+ return false;
+ }
+ return true;
+ }
+ @Override
public String toString(){
StringBuilder sb = new StringBuilder();
sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' );
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java Mon Oct 21 18:58:24 2013
@@ -17,8 +17,6 @@ package org.apache.lucene.search.vectorh
*/
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@@ -41,7 +39,6 @@ import org.apache.lucene.search.PhraseQu
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
-import org.apache.lucene.util.InPlaceMergeSorter;
/**
* FieldQuery breaks down query object into terms/phrases and keeps
@@ -333,8 +330,7 @@ public class FieldQuery {
return root.searchPhrase( phraseCandidate );
}
- /** Get the root map for the given field name. */
- public QueryPhraseMap getRootMap( String fieldName ){
+ private QueryPhraseMap getRootMap( String fieldName ){
return rootMaps.get( fieldMatch ? fieldName : null );
}
@@ -351,7 +347,6 @@ public class FieldQuery {
boolean terminal;
int slop; // valid if terminal == true and phraseHighlight == true
float boost; // valid if terminal == true
- int[] positions; // valid if terminal == true
int termOrPhraseNumber; // valid if terminal == true
FieldQuery fieldQuery;
Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>();
@@ -374,107 +369,38 @@ public class FieldQuery {
return map;
}
- void add( Query query, IndexReader reader ) {
+ void add( Query query, IndexReader reader ) {
if( query instanceof TermQuery ){
addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
}
else if( query instanceof PhraseQuery ){
PhraseQuery pq = (PhraseQuery)query;
- final Term[] terms = pq.getTerms();
- final int[] positions = pq.getPositions();
- new InPlaceMergeSorter() {
-
- @Override
- protected void swap(int i, int j) {
- Term tmpTerm = terms[i];
- terms[i] = terms[j];
- terms[j] = tmpTerm;
-
- int tmpPos = positions[i];
- positions[i] = positions[j];
- positions[j] = tmpPos;
- }
-
- @Override
- protected int compare(int i, int j) {
- return positions[i] - positions[j];
- }
- }.sort(0, terms.length);
-
- addToMap(pq, terms, positions, 0, subMap, pq.getSlop());
+ Term[] terms = pq.getTerms();
+ Map<String, QueryPhraseMap> map = subMap;
+ QueryPhraseMap qpm = null;
+ for( Term term : terms ){
+ qpm = getOrNewMap( map, term.text() );
+ map = qpm.subMap;
+ }
+ qpm.markTerminal( pq.getSlop(), pq.getBoost() );
}
else
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
}
-
- private int numTermsAtSamePosition(int[] positions, int i) {
- int numTermsAtSamePosition = 1;
- for (int j = i + 1; j < positions.length; ++j) {
- if (positions[j] == positions[i]) {
- ++numTermsAtSamePosition;
- }
- }
- return numTermsAtSamePosition;
- }
-
- private void addToMap(PhraseQuery pq, Term[] terms, int[] positions, int i, Map<String, QueryPhraseMap> map, int slop) {
- int numTermsAtSamePosition = numTermsAtSamePosition(positions, i);
- for (int j = 0; j < numTermsAtSamePosition; ++j) {
- QueryPhraseMap qpm = getOrNewMap(map, terms[i + j].text());
- if (i + numTermsAtSamePosition == terms.length) {
- qpm.markTerminal(pq.getSlop(), pq.getBoost(), uniquePositions(positions));
- } else {
- addToMap(pq, terms, positions, i + numTermsAtSamePosition, qpm.subMap, slop);
- }
- }
- if (slop > 2 && i + numTermsAtSamePosition < terms.length) {
- Term[] otherTerms = Arrays.copyOf(terms, terms.length);
- int[] otherPositions = Arrays.copyOf(positions, positions.length);
- final int nextTermAtSamePosition = numTermsAtSamePosition(positions, i + numTermsAtSamePosition);
- System.arraycopy(terms, i + numTermsAtSamePosition, otherTerms, i, nextTermAtSamePosition);
- System.arraycopy(positions, i + numTermsAtSamePosition, otherPositions, i, nextTermAtSamePosition);
- System.arraycopy(terms, i, otherTerms, i + nextTermAtSamePosition, numTermsAtSamePosition);
- System.arraycopy(positions, i, otherPositions, i + nextTermAtSamePosition, numTermsAtSamePosition);
- addToMap(pq, otherTerms, otherPositions, i, map, slop - 2);
- }
- }
-
- private int[] uniquePositions(int[] positions) {
- int uniqueCount = 1;
- for (int i = 1; i < positions.length; ++i) {
- if (positions[i] != positions[i - 1]) {
- ++uniqueCount;
- }
- }
- if (uniqueCount == positions.length) {
- return positions;
- }
- int[] result = new int[uniqueCount];
- result[0] = positions[0];
- for (int i = 1, j = 1; i < positions.length; ++i) {
- if (positions[i] != positions[i - 1]) {
- result[j++] = positions[i];
- }
- }
- return result;
- }
-
+
public QueryPhraseMap getTermMap( String term ){
return subMap.get( term );
}
private void markTerminal( float boost ){
- markTerminal( 0, boost, null );
+ markTerminal( 0, boost );
}
- private void markTerminal( int slop, float boost, int[] positions ){
- if (slop > this.slop || (slop == this.slop && boost > this.boost)) {
- this.terminal = true;
- this.slop = slop;
- this.boost = boost;
- this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
- this.positions = positions;
- }
+ private void markTerminal( int slop, float boost ){
+ this.terminal = true;
+ this.slop = slop;
+ this.boost = boost;
+ this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
}
public boolean isTerminal(){
@@ -509,20 +435,15 @@ public class FieldQuery {
// if the candidate is a term, it is valid
if( phraseCandidate.size() == 1 ) return true;
-
- assert phraseCandidate.size() == positions.length;
// else check whether the candidate is valid phrase
// compare position-gaps between terms to slop
int pos = phraseCandidate.get( 0 ).getPosition();
- int totalDistance = 0;
for( int i = 1; i < phraseCandidate.size(); i++ ){
int nextPos = phraseCandidate.get( i ).getPosition();
- final int expectedDelta = positions[i] - positions[i - 1];
- final int actualDelta = nextPos - pos;
- totalDistance += Math.abs(expectedDelta - actualDelta);
+ if( Math.abs( nextPos - pos - 1 ) > slop ) return false;
pos = nextPos;
}
- return totalDistance <= slop;
+ return true;
}
}
}
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java Mon Oct 21 18:58:24 2013
@@ -145,13 +145,6 @@ public class FieldTermStack {
}
/**
- * Return the top TermInfo object of the stack without removing it.
- */
- public TermInfo peek() {
- return termList.peek();
- }
-
- /**
* @param termInfo the TermInfo object to be put on the top of the stack
*/
public void push( TermInfo termInfo ){
@@ -168,7 +161,8 @@ public class FieldTermStack {
}
/**
- * Single term with its position/offsets in the document and IDF weight
+ * Single term with its position/offsets in the document and IDF weight.
+ * It is Comparable but considers only position.
*/
public static class TermInfo implements Comparable<TermInfo>{
@@ -205,5 +199,30 @@ public class FieldTermStack {
public int compareTo( TermInfo o ){
return ( this.position - o.position );
}
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + position;
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ TermInfo other = (TermInfo) obj;
+ if (position != other.position) {
+ return false;
+ }
+ return true;
+ }
}
}
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java Mon Oct 21 18:58:24 2013
@@ -49,7 +49,7 @@ public class OffsetLimitTokenFilterTest
assertTokenStreamContents(filter, new String[] {"short", "toolong",
"evenmuchlongertext"});
- checkOneTermReuse(new Analyzer() {
+ checkOneTerm(new Analyzer() {
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Mon Oct 21 18:58:24 2013
@@ -21,6 +21,7 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.BreakIterator;
+import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
@@ -47,8 +48,8 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.lucene.util.LuceneTestCase;
@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"})
public class TestPostingsHighlighter extends LuceneTestCase {
@@ -86,12 +87,42 @@ public class TestPostingsHighlighter ext
ir.close();
dir.close();
}
+
+ public void testFormatWithMatchExceedingContentLength2() throws Exception {
+
+ String bodyText = "123 TEST 01234 TEST";
+
+ String[] snippets = formatWithMatchExceedingContentLength(bodyText);
+
+ assertEquals(1, snippets.length);
+ assertEquals("123 <b>TEST</b> 01234 TE", snippets[0]);
+ }
+
+ public void testFormatWithMatchExceedingContentLength3() throws Exception {
+
+ String bodyText = "123 5678 01234 TEST TEST";
+
+ String[] snippets = formatWithMatchExceedingContentLength(bodyText);
+
+ assertEquals(1, snippets.length);
+ assertEquals("123 5678 01234 TE", snippets[0]);
+ }
public void testFormatWithMatchExceedingContentLength() throws Exception {
-
- int maxLength = 17;
+
String bodyText = "123 5678 01234 TEST";
+ String[] snippets = formatWithMatchExceedingContentLength(bodyText);
+
+ assertEquals(1, snippets.length);
+ // LUCENE-5166: no snippet
+ assertEquals("123 5678 01234 TE", snippets[0]);
+ }
+
+ private String[] formatWithMatchExceedingContentLength(String bodyText) throws IOException {
+
+ int maxLength = 17;
+
final Analyzer analyzer = new MockAnalyzer(random());
Directory dir = newDirectory();
@@ -122,12 +153,9 @@ public class TestPostingsHighlighter ext
String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
- assertEquals(1, snippets.length);
- // LUCENE-5166: no snippet
- assertEquals("123 5678 01234 TE", snippets[0]);
-
ir.close();
dir.close();
+ return snippets;
}
// simple test highlighting last word.
@@ -1041,4 +1069,54 @@ public class TestPostingsHighlighter ext
ir.close();
dir.close();
}
+
+ // LUCENE-4906
+ public void testObjectFormatter() throws Exception {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+
+ FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+ offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ Field body = new Field("body", "", offsetsType);
+ Document doc = new Document();
+ doc.add(body);
+
+ body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore.");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ PostingsHighlighter highlighter = new PostingsHighlighter() {
+ @Override
+ protected PassageFormatter getFormatter(String field) {
+ return new PassageFormatter() {
+ PassageFormatter defaultFormatter = new DefaultPassageFormatter();
+
+ @Override
+ public String[] format(Passage passages[], String content) {
+ // Just turns the String snippet into a length 2
+ // array of String
+ return new String[] {"blah blah", defaultFormatter.format(passages, content).toString()};
+ }
+ };
+ }
+ };
+
+ Query query = new TermQuery(new Term("body", "highlighting"));
+ TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+ assertEquals(1, topDocs.totalHits);
+ int[] docIDs = new int[1];
+ docIDs[0] = topDocs.scoreDocs[0].doc;
+ Map<String,Object[]> snippets = highlighter.highlightFieldsAsObjects(new String[]{"body"}, query, searcher, docIDs, new int[] {1});
+ Object[] bodySnippets = snippets.get("body");
+ assertEquals(1, bodySnippets.length);
+ assertTrue(Arrays.equals(new String[] {"blah blah", "Just a test <b>highlighting</b> from postings. "}, (String[]) bodySnippets[0]));
+
+ ir.close();
+ dir.close();
+ }
}
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java Mon Oct 21 18:58:24 2013
@@ -170,20 +170,20 @@ public abstract class AbstractTestCase e
protected List<BytesRef> analyze(String text, String field, Analyzer analyzer) throws IOException {
List<BytesRef> bytesRefs = new ArrayList<BytesRef>();
- TokenStream tokenStream = analyzer.tokenStream(field, text);
- TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class);
+ try (TokenStream tokenStream = analyzer.tokenStream(field, text)) {
+ TermToBytesRefAttribute termAttribute = tokenStream.getAttribute(TermToBytesRefAttribute.class);
- BytesRef bytesRef = termAttribute.getBytesRef();
+ BytesRef bytesRef = termAttribute.getBytesRef();
- tokenStream.reset();
+ tokenStream.reset();
- while (tokenStream.incrementToken()) {
- termAttribute.fillBytesRef();
- bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
- }
+ while (tokenStream.incrementToken()) {
+ termAttribute.fillBytesRef();
+ bytesRefs.add(BytesRef.deepCopyOf(bytesRef));
+ }
- tokenStream.end();
- tokenStream.close();
+ tokenStream.end();
+ }
return bytesRefs;
}
@@ -264,7 +264,8 @@ public abstract class AbstractTestCase e
}
@Override
- public final void end(){
+ public final void end() throws IOException {
+ super.end();
offsetAtt.setOffset(getFinalOffset(),getFinalOffset());
}
@@ -318,7 +319,8 @@ public abstract class AbstractTestCase e
}
@Override
- public void reset() {
+ public void reset() throws IOException {
+ super.reset();
startTerm = 0;
nextStartOffset = 0;
snippet = null;
Modified: lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java?rev=1534320&r1=1534319&r2=1534320&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (original)
+++ lucene/dev/branches/lucene4956/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java Mon Oct 21 18:58:24 2013
@@ -16,18 +16,18 @@ package org.apache.lucene.search.vectorh
* limitations under the License.
*/
import java.io.IOException;
-import java.io.Reader;
-import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@@ -35,7 +35,6 @@ import org.apache.lucene.document.TextFi
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause.Occur;
@@ -45,13 +44,16 @@ import org.apache.lucene.search.PhraseQu
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.highlight.DefaultEncoder;
+import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
public class FastVectorHighlighterTest extends LuceneTestCase {
-
- private static final String FIELD = "text";
+
public void testSimpleHighlightTest() throws IOException {
Directory dir = newDirectory();
@@ -298,128 +300,222 @@ public class FastVectorHighlighterTest e
writer.close();
dir.close();
}
+
+ public void testMatchedFields() throws IOException { // Each call indexes one document built from the given text and compares the first fragment highlighted for "field" with the expected string
+ // Without matched fields (useMatchedFields = false), searching just on the stored field doesn't highlight a stopword
+ matchedFieldsTestCase( false, true, "a match", "a <b>match</b>",
+ clause( "field", "a" ), clause( "field", "match" ) );
+
+ // Even if you add an unqueried matched field that would match it
+ matchedFieldsTestCase( "a match", "a <b>match</b>",
+ clause( "field", "a" ), clause( "field", "match" ) );
+
+ // Nor if you query the field but don't add it as a matched field to the highlighter (useMatchedFields = false, fieldMatch = false)
+ matchedFieldsTestCase( false, false, "a match", "a <b>match</b>",
+ clause( "field_exact", "a" ), clause( "field", "match" ) );
+
+ // But if you query the field and add it as a matched field to the highlighter then it is highlighted
+ matchedFieldsTestCase( "a match", "<b>a</b> <b>match</b>",
+ clause( "field_exact", "a" ), clause( "field", "match" ) );
+
+ // It is also ok to match just the matched field but get highlighting from the stored field
+ matchedFieldsTestCase( "a match", "<b>a</b> <b>match</b>",
+ clause( "field_exact", "a" ), clause( "field_exact", "match" ) );
+
+ // Boosted matched fields work too
+ matchedFieldsTestCase( "a match", "<b>a</b> <b>match</b>",
+ clause( "field_exact", 5, "a" ), clause( "field", "match" ) );
+
+ // It is also ok if both the stored and the matched field match the term
+ matchedFieldsTestCase( "a match", "a <b>match</b>",
+ clause( "field_exact", "match" ), clause( "field", "match" ) );
+
+ // And the highlighter respects the boosts on matched fields when sorting fragments
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk",
+ clause( "field", "cat" ), clause( "field_exact", "a", "cat" ) );
+
+ // The same thing works across three fields as well
+ matchedFieldsTestCase( "cat cat CAT junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 200, "a", "cat" ), clause( "field_super_exact", 5, "CAT" ) );
+ matchedFieldsTestCase( "a cat cat junk junk junk junk junk junk junk a CAT junk junk",
+ "junk junk <b>a CAT</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ), clause( "field_super_exact", 200, "a", "CAT" ) );
+
+ // And across fields with different tokenizers!
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field_exact", 5, "a", "cat" ), clause( "field_characters", "c" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>c</b>at <b>c</b>at junk junk junk junk",
+ clause( "field_exact", "a", "cat" ), clause( "field_characters", "c" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "ca<b>t</b> ca<b>t</b> junk junk junk junk",
+ clause( "field_exact", "a", "cat" ), clause( "field_characters", "t" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk", // See how the phrases are joined?
+ clause( "field", "cat" ), clause( "field_characters", 5, "c" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_characters", 5, "a", " ", "c", "a", "t" ) );
+
+ // Phrases and tokens inside one another are joined
+ matchedFieldsTestCase( "cats wow", "<b>cats w</b>ow",
+ clause( "field", "cats" ), clause( "field_tripples", "s w" ) );
+
+ // Everything works pretty well even if you don't require a field match
+ matchedFieldsTestCase( true, false, "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_characters", 10, "a", " ", "c", "a", "t" ) );
+
+ // Even boosts keep themselves pretty much intact
+ matchedFieldsTestCase( true, false, "a cat cat junk junk junk junk junk junk junk a CAT junk junk",
+ "junk junk <b>a CAT</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 5, "a", "cat" ), clause( "field_super_exact", 200, "a", "CAT" ) );
+ matchedFieldsTestCase( true, false, "cat cat CAT junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_exact", 200, "a", "cat" ), clause( "field_super_exact", 5, "CAT" ) );
+
+ // Except that all the matched field matches apply even if they aren't mentioned in the query
+ // which can make for some confusing scoring. This isn't too big a deal, just something you
+ // need to think about when you don't force a field match.
+ matchedFieldsTestCase( true, false, "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk",
+ clause( "field", "cat" ), clause( "field_characters", 4, "a", " ", "c", "a", "t" ) );
+
+ // It is also cool to match fields that don't have _exactly_ the same text so long as you are careful.
+ // In this case field_sliced is a prefix of field.
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk", clause( "field_sliced", "cat" ) );
+
+ // Multiple matches add to the score of the segment
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> <b>cat</b> junk junk junk junk",
+ clause( "field", "cat" ), clause( "field_sliced", "cat" ), clause( "field_exact", 2, "a", "cat" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "junk junk <b>a cat</b> junk junk",
+ clause( "field", "cat" ), clause( "field_sliced", "cat" ), clause( "field_exact", 4, "a", "cat" ) );
+
+ // Even fields with tokens on top of one another are ok
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> cat junk junk junk junk",
+ clause( "field_der_red", 2, "der" ), clause( "field_exact", "a", "cat" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> cat junk junk junk junk",
+ clause( "field_der_red", 2, "red" ), clause( "field_exact", "a", "cat" ) );
+ matchedFieldsTestCase( "cat cat junk junk junk junk junk junk junk a cat junk junk",
+ "<b>cat</b> cat junk junk junk junk",
+ clause( "field_der_red", "red" ), clause( "field_der_red", "der" ), clause( "field_exact", "a", "cat" ) );
+ }
- public void testOverlappingPhrases() throws IOException {
- final Analyzer analyzer = new Analyzer() {
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- final Tokenizer source = new MockTokenizer(reader);
- TokenStream sink = source;
- sink = new SynonymFilter(sink);
- return new TokenStreamComponents(source, sink);
- }
-
- };
- final Directory directory = newDirectory();
- RandomIndexWriter iw = new RandomIndexWriter(random(), directory, analyzer);
- Document doc = new Document();
- FieldType withVectors = new FieldType(TextField.TYPE_STORED);
- withVectors.setStoreTermVectors(true);
- withVectors.setStoreTermVectorPositions(true);
- withVectors.setStoreTermVectorOffsets(true);
- doc.add(new Field(FIELD, "a b c", withVectors));
- iw.addDocument(doc);
- DirectoryReader ir = iw.getReader();
-
- // Disjunction of two overlapping phrase queries
- final PhraseQuery pq1 = new PhraseQuery();
- pq1.add(new Term(FIELD, "a"), 0);
- pq1.add(new Term(FIELD, "b"), 1);
- pq1.add(new Term(FIELD, "c"), 2);
-
- final PhraseQuery pq2 = new PhraseQuery();
- pq2.add(new Term(FIELD, "a"), 0);
- pq2.add(new Term(FIELD, "B"), 1);
- pq2.add(new Term(FIELD, "c"), 2);
-
- final BooleanQuery bq = new BooleanQuery();
- bq.add(pq1, Occur.SHOULD);
- bq.add(pq2, Occur.SHOULD);
-
- // Single phrase query with two terms at the same position
- final PhraseQuery pq = new PhraseQuery();
- pq.add(new Term(FIELD, "a"), 0);
- pq.add(new Term(FIELD, "b"), 1);
- pq.add(new Term(FIELD, "B"), 1);
- pq.add(new Term(FIELD, "c"), 2);
-
- for (Query query : Arrays.asList(pq1, pq2, bq, pq)) {
- assertEquals(1, new IndexSearcher(ir).search(bq, 1).totalHits);
-
- FastVectorHighlighter highlighter = new FastVectorHighlighter();
- FieldQuery fieldQuery = highlighter.getFieldQuery(query, ir);
- String[] bestFragments = highlighter.getBestFragments(fieldQuery, ir, 0, FIELD, 1000, 1);
- assertEquals("<b>a b c</b>", bestFragments[0]);
- }
-
- ir.close();
- iw.close();
- directory.close();
+ private void matchedFieldsTestCase( String fieldValue, String expected, Query... queryClauses ) throws IOException { // Convenience overload: runs the full test case with useMatchedFields = true and fieldMatch = true
+ matchedFieldsTestCase( true, true, fieldValue, expected, queryClauses );
}
- public void testPhraseWithGap() throws IOException {
- final Directory directory = newDirectory();
- RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+ private void matchedFieldsTestCase( boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses ) throws IOException { // Indexes fieldValue under several differently analyzed fields, highlights "field" for the conjunction of queryClauses, and asserts that the first fragment equals expected
Document doc = new Document();
- FieldType withVectors = new FieldType(TextField.TYPE_STORED);
- withVectors.setStoreTermVectors(true);
- withVectors.setStoreTermVectorPositions(true);
- withVectors.setStoreTermVectorOffsets(true);
- doc.add(new Field(FIELD, "a b c", withVectors));
- iw.addDocument(doc);
- DirectoryReader ir = iw.getReader();
-
- final PhraseQuery pq = new PhraseQuery();
- pq.add(new Term(FIELD, "c"), 2);
- pq.add(new Term(FIELD, "a"), 0);
+ FieldType stored = new FieldType( TextField.TYPE_STORED ); // type used only for "field"; term vectors with positions and offsets are enabled below
+ stored.setStoreTermVectorOffsets( true );
+ stored.setStoreTermVectorPositions( true );
+ stored.setStoreTermVectors( true );
+ stored.freeze();
+ FieldType matched = new FieldType( TextField.TYPE_NOT_STORED ); // same term vector settings, but based on TYPE_NOT_STORED; used for every other field
+ matched.setStoreTermVectorOffsets( true );
+ matched.setStoreTermVectorPositions( true );
+ matched.setStoreTermVectors( true );
+ matched.freeze();
+ doc.add( new Field( "field", fieldValue, stored ) ); // Whitespace tokenized with English stop words
+ doc.add( new Field( "field_exact", fieldValue, matched ) ); // Whitespace tokenized without stop words
+ doc.add( new Field( "field_super_exact", fieldValue, matched ) ); // Whitespace tokenized without toLower
+ doc.add( new Field( "field_characters", fieldValue, matched ) ); // Each letter is a token
+ doc.add( new Field( "field_tripples", fieldValue, matched ) ); // Every three letters is a token
+ doc.add( new Field( "field_sliced", fieldValue.substring( 0, // Sliced at 10 chars then analyzed just like field
+ Math.min( fieldValue.length() - 1 , 10 ) ), matched ) ); // end index is at most length - 1, so the final character of fieldValue is always excluded
+ doc.add( new Field( "field_der_red", new CannedTokenStream( // Hacky field containing "der" and "red" at pos = 0
+ token( "der", 1, 0, 3 ),
+ token( "red", 0, 0, 3 )
+ ), matched ) );
+
+ final Map<String, Analyzer> fieldAnalyzers = new TreeMap<String, Analyzer>(); // analyzer per field name, looked up by the wrapper below
+ fieldAnalyzers.put( "field", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET ) );
+ fieldAnalyzers.put( "field_exact", new MockAnalyzer( random() ) );
+ fieldAnalyzers.put( "field_super_exact", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, false ) );
+ fieldAnalyzers.put( "field_characters", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp(".").toAutomaton() ), true ) );
+ fieldAnalyzers.put( "field_tripples", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp("...").toAutomaton() ), true ) );
+ fieldAnalyzers.put( "field_sliced", fieldAnalyzers.get( "field" ) ); // shares the analyzer instance registered for "field"
+ fieldAnalyzers.put( "field_der_red", fieldAnalyzers.get( "field" ) ); // This is required even though we provide a token stream
+ Analyzer analyzer = new AnalyzerWrapper() {
+ public Analyzer getWrappedAnalyzer(String fieldName) { // yields null for any field name that was not registered above
+ return fieldAnalyzers.get( fieldName );
+ }
+ };
- assertEquals(1, new IndexSearcher(ir).search(pq, 1).totalHits);
+ Directory dir = newDirectory();
+ IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer ) );
+ writer.addDocument( doc ); // the index holds exactly this one document
FastVectorHighlighter highlighter = new FastVectorHighlighter();
- FieldQuery fieldQuery = highlighter.getFieldQuery(pq, ir);
- String[] bestFragments = highlighter.getBestFragments(fieldQuery, ir, 0, FIELD, 1000, 1);
- assertEquals("<b>a</b> b <b>c</b>", bestFragments[0]);
-
- ir.close();
- iw.close();
- directory.close();
- }
-
- // Simple token filter that adds 'B' as a synonym of 'b'
- private static class SynonymFilter extends TokenFilter {
+ FragListBuilder fragListBuilder = new SimpleFragListBuilder();
+ FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
+ IndexReader reader = DirectoryReader.open( writer, true ); // reader is opened from the still-open writer
+ String[] preTags = new String[] { "<b>" };
+ String[] postTags = new String[] { "</b>" };
+ Encoder encoder = new DefaultEncoder();
+ int docId = 0;
+ BooleanQuery query = new BooleanQuery();
+ for ( Query clause : queryClauses ) {
+ query.add( clause, Occur.MUST ); // every clause is required
+ }
+ FieldQuery fieldQuery = new FieldQuery( query, reader, true, fieldMatch ); // NOTE(review): the literal true is presumably FieldQuery's phraseHighlight flag - confirm against the constructor
+ String[] bestFragments;
+ if ( useMatchedFields ) { // hand all seven indexed field names to the highlighter as matched fields
+ Set< String > matchedFields = new HashSet< String >();
+ matchedFields.add( "field" );
+ matchedFields.add( "field_exact" );
+ matchedFields.add( "field_super_exact" );
+ matchedFields.add( "field_characters" );
+ matchedFields.add( "field_tripples" );
+ matchedFields.add( "field_sliced" );
+ matchedFields.add( "field_der_red" );
+ bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", matchedFields, 25, 1,
+ fragListBuilder, fragmentsBuilder, preTags, postTags, encoder );
+ } else { // same call through the overload that takes no matched-fields set
+ bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 25, 1,
+ fragListBuilder, fragmentsBuilder, preTags, postTags, encoder );
+ }
+ assertEquals( expected, bestFragments[ 0 ] ); // only the first returned fragment is checked
- final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ reader.close(); // NOTE(review): the three close calls are not reached when the assertion above fails (no try/finally)
+ writer.close();
+ dir.close();
+ }
- State pending;
+ private Query clause( String field, String... terms ) { // Convenience overload: builds the clause with a boost of 1
+ return clause( field, 1, terms );
+ }
- protected SynonymFilter(TokenStream input) {
- super(input);
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (pending != null) {
- restoreState(pending);
- termAtt.setEmpty().append('B');
- posIncAtt.setPositionIncrement(0);
- pending = null;
- return true;
- }
- if (!input.incrementToken()) {
- return false;
- }
- if (termAtt.toString().equals("b")) {
- pending = captureState();
+ private Query clause( String field, float boost, String... terms ) { // Builds a query on field from terms and sets its boost
+ Query q;
+ if ( terms.length == 1 ) { // exactly one term: plain TermQuery
+ q = new TermQuery( new Term( field, terms[ 0 ] ) );
+ } else { // any other count (including zero): PhraseQuery with the terms added in the order given
+ PhraseQuery pq = new PhraseQuery();
+ for ( String term: terms ) {
+ pq.add( new Term( field, term ) );
}
- return true;
+ q = pq;
}
+ q.setBoost( boost );
+ return q;
+ }
- @Override
- public void reset() throws IOException {
- super.reset();
- pending = null;
- }
+ private static Token token( String term, int posInc, int startOffset, int endOffset ) { // Creates a Token for term with the given offsets and position increment
+ Token t = new Token( term, startOffset, endOffset );
+ t.setPositionIncrement( posInc ); // set after construction; the constructor call above only receives the term and offsets
+ return t;
}
}