You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2013/05/30 09:53:46 UTC
svn commit: r1487777 [27/50] - in /lucene/dev/branches/security: ./
dev-tools/ dev-tools/eclipse/dot.settings/ dev-tools/idea/.idea/
dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/replicator/
dev-tools/maven/ dev-tools/maven/lucene/ dev-tools/ma...
Modified: lucene/dev/branches/security/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/branches/security/lucene/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Thu May 30 07:53:18 2013
@@ -67,45 +67,44 @@ public class TestGrouping extends Lucene
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
- boolean canUseIDV = true;
// 0
Document doc = new Document();
- addGroupField(doc, groupField, "author1", canUseIDV);
+ addGroupField(doc, groupField, "author1");
doc.add(new TextField("content", "random text", Field.Store.YES));
doc.add(new Field("id", "1", customType));
w.addDocument(doc);
// 1
doc = new Document();
- addGroupField(doc, groupField, "author1", canUseIDV);
+ addGroupField(doc, groupField, "author1");
doc.add(new TextField("content", "some more random text", Field.Store.YES));
doc.add(new Field("id", "2", customType));
w.addDocument(doc);
// 2
doc = new Document();
- addGroupField(doc, groupField, "author1", canUseIDV);
+ addGroupField(doc, groupField, "author1");
doc.add(new TextField("content", "some more random textual data", Field.Store.YES));
doc.add(new Field("id", "3", customType));
w.addDocument(doc);
// 3
doc = new Document();
- addGroupField(doc, groupField, "author2", canUseIDV);
+ addGroupField(doc, groupField, "author2");
doc.add(new TextField("content", "some random text", Field.Store.YES));
doc.add(new Field("id", "4", customType));
w.addDocument(doc);
// 4
doc = new Document();
- addGroupField(doc, groupField, "author3", canUseIDV);
+ addGroupField(doc, groupField, "author3");
doc.add(new TextField("content", "some more random text", Field.Store.YES));
doc.add(new Field("id", "5", customType));
w.addDocument(doc);
// 5
doc = new Document();
- addGroupField(doc, groupField, "author3", canUseIDV);
+ addGroupField(doc, groupField, "author3");
doc.add(new TextField("content", "random", Field.Store.YES));
doc.add(new Field("id", "6", customType));
w.addDocument(doc);
@@ -116,12 +115,12 @@ public class TestGrouping extends Lucene
doc.add(new Field("id", "6", customType));
w.addDocument(doc);
- IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
+ IndexSearcher indexSearcher = newSearcher(w.getReader());
w.close();
final Sort groupSort = Sort.RELEVANCE;
- if (canUseIDV && random().nextBoolean()) {
+ if (random().nextBoolean()) {
groupField += "_dv";
}
@@ -172,18 +171,16 @@ public class TestGrouping extends Lucene
dir.close();
}
- private void addGroupField(Document doc, String groupField, String value, boolean canUseIDV) {
+ private void addGroupField(Document doc, String groupField, String value) {
doc.add(new TextField(groupField, value, Field.Store.YES));
- if (canUseIDV) {
- doc.add(new SortedDocValuesField(groupField + "_dv", new BytesRef(value)));
- }
+ doc.add(new SortedDocValuesField(groupField + "_dv", new BytesRef(value)));
}
private AbstractFirstPassGroupingCollector<?> createRandomFirstPassCollector(String groupField, Sort groupSort, int topDocs) throws IOException {
AbstractFirstPassGroupingCollector<?> selected;
if (random().nextBoolean()) {
ValueSource vs = new BytesRefFieldSource(groupField);
- selected = new FunctionFirstPassGroupingCollector(vs, new HashMap<Object, Object>(), groupSort, topDocs);
+ selected = new FunctionFirstPassGroupingCollector(vs, new HashMap<>(), groupSort, topDocs);
} else {
selected = new TermFirstPassGroupingCollector(groupField, groupSort, topDocs);
}
@@ -196,7 +193,7 @@ public class TestGrouping extends Lucene
private AbstractFirstPassGroupingCollector<?> createFirstPassCollector(String groupField, Sort groupSort, int topDocs, AbstractFirstPassGroupingCollector<?> firstPassGroupingCollector) throws IOException {
if (TermFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
ValueSource vs = new BytesRefFieldSource(groupField);
- return new FunctionFirstPassGroupingCollector(vs, new HashMap<Object, Object>(), groupSort, topDocs);
+ return new FunctionFirstPassGroupingCollector(vs, new HashMap<>(), groupSort, topDocs);
} else {
return new TermFirstPassGroupingCollector(groupField, groupSort, topDocs);
}
@@ -238,9 +235,9 @@ public class TestGrouping extends Lucene
return new TermSecondPassGroupingCollector(groupField, searchGroups, groupSort, sortWithinGroup, maxDocsPerGroup , getScores, getMaxScores, fillSortFields);
} else {
ValueSource vs = new BytesRefFieldSource(groupField);
- List<SearchGroup<MutableValue>> mvalSearchGroups = new ArrayList<SearchGroup<MutableValue>>(searchGroups.size());
+ List<SearchGroup<MutableValue>> mvalSearchGroups = new ArrayList<>(searchGroups.size());
for (SearchGroup<BytesRef> mergedTopGroup : searchGroups) {
- SearchGroup<MutableValue> sg = new SearchGroup<MutableValue>();
+ SearchGroup<MutableValue> sg = new SearchGroup<>();
MutableValueStr groupValue = new MutableValueStr();
if (mergedTopGroup.groupValue != null) {
groupValue.value = mergedTopGroup.groupValue;
@@ -253,7 +250,7 @@ public class TestGrouping extends Lucene
mvalSearchGroups.add(sg);
}
- return new FunctionSecondPassGroupingCollector(mvalSearchGroups, groupSort, sortWithinGroup, maxDocsPerGroup, getScores, getMaxScores, fillSortFields, vs, new HashMap<Object, Object>());
+ return new FunctionSecondPassGroupingCollector(mvalSearchGroups, groupSort, sortWithinGroup, maxDocsPerGroup, getScores, getMaxScores, fillSortFields, vs, new HashMap<>());
}
}
@@ -263,7 +260,7 @@ public class TestGrouping extends Lucene
return new TermAllGroupsCollector(groupField);
} else {
ValueSource vs = new BytesRefFieldSource(groupField);
- return new FunctionAllGroupsCollector(vs, new HashMap<Object, Object>());
+ return new FunctionAllGroupsCollector(vs, new HashMap<>());
}
}
@@ -299,9 +296,9 @@ public class TestGrouping extends Lucene
return null;
}
- List<SearchGroup<BytesRef>> groups = new ArrayList<SearchGroup<BytesRef>>(mutableValueGroups.size());
+ List<SearchGroup<BytesRef>> groups = new ArrayList<>(mutableValueGroups.size());
for (SearchGroup<MutableValue> mutableValueGroup : mutableValueGroups) {
- SearchGroup<BytesRef> sg = new SearchGroup<BytesRef>();
+ SearchGroup<BytesRef> sg = new SearchGroup<>();
sg.groupValue = mutableValueGroup.groupValue.exists() ? ((MutableValueStr) mutableValueGroup.groupValue).value : null;
sg.sortValues = mutableValueGroup.sortValues;
groups.add(sg);
@@ -318,10 +315,10 @@ public class TestGrouping extends Lucene
return ((TermSecondPassGroupingCollector) c).getTopGroups(withinGroupOffset);
} else if (c.getClass().isAssignableFrom(FunctionSecondPassGroupingCollector.class)) {
TopGroups<MutableValue> mvalTopGroups = ((FunctionSecondPassGroupingCollector) c).getTopGroups(withinGroupOffset);
- List<GroupDocs<BytesRef>> groups = new ArrayList<GroupDocs<BytesRef>>(mvalTopGroups.groups.length);
+ List<GroupDocs<BytesRef>> groups = new ArrayList<>(mvalTopGroups.groups.length);
for (GroupDocs<MutableValue> mvalGd : mvalTopGroups.groups) {
BytesRef groupValue = mvalGd.groupValue.exists() ? ((MutableValueStr) mvalGd.groupValue).value : null;
- groups.add(new GroupDocs<BytesRef>(Float.NaN, mvalGd.maxScore, mvalGd.totalHits, mvalGd.scoreDocs, groupValue, mvalGd.groupSortValues));
+ groups.add(new GroupDocs<>(Float.NaN, mvalGd.maxScore, mvalGd.totalHits, mvalGd.scoreDocs, groupValue, mvalGd.groupSortValues));
}
return new TopGroups<BytesRef>(mvalTopGroups.groupSort, mvalTopGroups.withinGroupSort, mvalTopGroups.totalHitCount, mvalTopGroups.totalGroupedHitCount, groups.toArray(new GroupDocs[groups.size()]), Float.NaN);
}
@@ -349,7 +346,7 @@ public class TestGrouping extends Lucene
}
private Sort getRandomSort() {
- final List<SortField> sortFields = new ArrayList<SortField>();
+ final List<SortField> sortFields = new ArrayList<>();
if (random().nextInt(7) == 2) {
sortFields.add(SortField.FIELD_SCORE);
} else {
@@ -411,14 +408,14 @@ public class TestGrouping extends Lucene
final Comparable<?> c;
final SortField sf = sortFields[fieldIDX];
if (sf.getType() == SortField.Type.SCORE) {
- c = new Float(d.score);
+ c = d.score;
} else if (sf.getField().equals("sort1")) {
c = d.sort1;
} else if (sf.getField().equals("sort2")) {
c = d.sort2;
} else {
assertEquals("id", sf.getField());
- c = new Integer(d.id);
+ c = d.id;
}
fields[fieldIDX] = c;
}
@@ -449,12 +446,12 @@ public class TestGrouping extends Lucene
final Comparator<GroupDoc> groupSortComp = getComparator(groupSort);
Arrays.sort(groupDocs, groupSortComp);
- final HashMap<BytesRef,List<GroupDoc>> groups = new HashMap<BytesRef,List<GroupDoc>>();
- final List<BytesRef> sortedGroups = new ArrayList<BytesRef>();
- final List<Comparable<?>[]> sortedGroupFields = new ArrayList<Comparable<?>[]>();
+ final HashMap<BytesRef,List<GroupDoc>> groups = new HashMap<>();
+ final List<BytesRef> sortedGroups = new ArrayList<>();
+ final List<Comparable<?>[]> sortedGroupFields = new ArrayList<>();
int totalHitCount = 0;
- Set<BytesRef> knownGroups = new HashSet<BytesRef>();
+ Set<BytesRef> knownGroups = new HashSet<>();
//System.out.println("TEST: slowGrouping");
for(GroupDoc d : groupDocs) {
@@ -479,7 +476,7 @@ public class TestGrouping extends Lucene
if (fillFields) {
sortedGroupFields.add(fillFields(d, groupSort));
}
- l = new ArrayList<GroupDoc>();
+ l = new ArrayList<>();
groups.put(d.group, l);
}
l.add(d);
@@ -519,7 +516,7 @@ public class TestGrouping extends Lucene
hits = new ScoreDoc[0];
}
- result[idx-groupOffset] = new GroupDocs<BytesRef>(Float.NaN,
+ result[idx-groupOffset] = new GroupDocs<>(Float.NaN,
0.0f,
docs.size(),
hits,
@@ -528,20 +525,20 @@ public class TestGrouping extends Lucene
}
if (doAllGroups) {
- return new TopGroups<BytesRef>(
- new TopGroups<BytesRef>(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, Float.NaN),
- knownGroups.size()
+ return new TopGroups<>(
+ new TopGroups<>(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, Float.NaN),
+ knownGroups.size()
);
} else {
- return new TopGroups<BytesRef>(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, Float.NaN);
+ return new TopGroups<>(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result, Float.NaN);
}
}
private DirectoryReader getDocBlockReader(Directory dir, GroupDoc[] groupDocs) throws IOException {
// Coalesce by group, but in random order:
Collections.shuffle(Arrays.asList(groupDocs), random());
- final Map<BytesRef,List<GroupDoc>> groupMap = new HashMap<BytesRef,List<GroupDoc>>();
- final List<BytesRef> groupValues = new ArrayList<BytesRef>();
+ final Map<BytesRef,List<GroupDoc>> groupMap = new HashMap<>();
+ final List<BytesRef> groupValues = new ArrayList<>();
for(GroupDoc groupDoc : groupDocs) {
if (!groupMap.containsKey(groupDoc.group)) {
@@ -557,7 +554,7 @@ public class TestGrouping extends Lucene
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random())));
- final List<List<Document>> updateDocs = new ArrayList<List<Document>>();
+ final List<List<Document>> updateDocs = new ArrayList<>();
FieldType groupEndType = new FieldType(StringField.TYPE_NOT_STORED);
groupEndType.setIndexOptions(IndexOptions.DOCS_ONLY);
@@ -565,7 +562,7 @@ public class TestGrouping extends Lucene
//System.out.println("TEST: index groups");
for(BytesRef group : groupValues) {
- final List<Document> docs = new ArrayList<Document>();
+ final List<Document> docs = new ArrayList<>();
//System.out.println("TEST: group=" + (group == null ? "null" : group.utf8ToString()));
for(GroupDoc groupValue : groupMap.get(group)) {
Document doc = new Document();
@@ -637,7 +634,7 @@ public class TestGrouping extends Lucene
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
}
- final List<BytesRef> groups = new ArrayList<BytesRef>();
+ final List<BytesRef> groups = new ArrayList<>();
for(int i=0;i<numGroups;i++) {
String randomValue;
do {
@@ -790,14 +787,14 @@ public class TestGrouping extends Lucene
// ReaderBlocks only increases maxDoc() vs reader, which
// means a monotonic shift in scores, so we can
// reliably remap them w/ Map:
- final Map<String,Map<Float,Float>> scoreMap = new HashMap<String,Map<Float,Float>>();
+ final Map<String,Map<Float,Float>> scoreMap = new HashMap<>();
// Tricky: must separately set .score2, because the doc
// block index was created with possible deletions!
//System.out.println("fixup score2");
for(int contentID=0;contentID<3;contentID++) {
//System.out.println(" term=real" + contentID);
- final Map<Float,Float> termScoreMap = new HashMap<Float,Float>();
+ final Map<Float,Float> termScoreMap = new HashMap<>();
scoreMap.put("real"+contentID, termScoreMap);
//System.out.println("term=real" + contentID + " dfold=" + s.docFreq(new Term("content", "real"+contentID)) +
//" dfnew=" + sBlocks.docFreq(new Term("content", "real"+contentID)));
@@ -939,7 +936,7 @@ public class TestGrouping extends Lucene
// Get 1st pass top groups using shards
- ValueHolder<Boolean> idvBasedImplsUsedSharded = new ValueHolder<Boolean>(false);
+ ValueHolder<Boolean> idvBasedImplsUsedSharded = new ValueHolder<>(false);
final TopGroups<BytesRef> topGroupsShards = searchShards(s, shards.subSearchers, query, groupSort, docSort,
groupOffset, topNGroups, docOffset, docsPerGroup, getScores, getMaxScores, canUseIDV, false, idvBasedImplsUsedSharded);
final AbstractSecondPassGroupingCollector<?> c2;
@@ -971,7 +968,7 @@ public class TestGrouping extends Lucene
if (doAllGroups) {
TopGroups<BytesRef> tempTopGroups = getTopGroups(c2, docOffset);
- groupsResult = new TopGroups<BytesRef>(tempTopGroups, allGroupsCollector.getGroupCount());
+ groupsResult = new TopGroups<>(tempTopGroups, allGroupsCollector.getGroupCount());
} else {
groupsResult = getTopGroups(c2, docOffset);
}
@@ -1058,7 +1055,7 @@ public class TestGrouping extends Lucene
final TopGroups<BytesRef> groupsResultBlocks;
if (doAllGroups && tempTopGroupsBlocks != null) {
assertEquals((int) tempTopGroupsBlocks.totalGroupCount, allGroupsCollector2.getGroupCount());
- groupsResultBlocks = new TopGroups<BytesRef>(tempTopGroupsBlocks, allGroupsCollector2.getGroupCount());
+ groupsResultBlocks = new TopGroups<>(tempTopGroupsBlocks, allGroupsCollector2.getGroupCount());
} else {
groupsResultBlocks = tempTopGroupsBlocks;
}
@@ -1086,7 +1083,7 @@ public class TestGrouping extends Lucene
// Block index does not index DocValues so we pass
// false for canUseIDV:
final TopGroups<BytesRef> topGroupsBlockShards = searchShards(sBlocks, shardsBlocks.subSearchers, query,
- groupSort, docSort, groupOffset, topNGroups, docOffset, docsPerGroup, getScores, getMaxScores, false, false, new ValueHolder<Boolean>(false));
+ groupSort, docSort, groupOffset, topNGroups, docOffset, docsPerGroup, getScores, getMaxScores, false, false, new ValueHolder<>(false));
if (expectedGroups != null) {
// Fixup scores for reader2
@@ -1168,8 +1165,8 @@ public class TestGrouping extends Lucene
}
// Run 1st pass collector to get top groups per shard
final Weight w = topSearcher.createNormalizedWeight(query);
- final List<Collection<SearchGroup<BytesRef>>> shardGroups = new ArrayList<Collection<SearchGroup<BytesRef>>>();
- List<AbstractFirstPassGroupingCollector<?>> firstPassGroupingCollectors = new ArrayList<AbstractFirstPassGroupingCollector<?>>();
+ final List<Collection<SearchGroup<BytesRef>>> shardGroups = new ArrayList<>();
+ List<AbstractFirstPassGroupingCollector<?>> firstPassGroupingCollectors = new ArrayList<>();
AbstractFirstPassGroupingCollector<?> firstPassCollector = null;
boolean shardsCanUseIDV;
if (canUseIDV) {
Modified: lucene/dev/branches/security/lucene/highlighter/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/build.xml?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/build.xml (original)
+++ lucene/dev/branches/security/lucene/highlighter/build.xml Thu May 30 07:53:18 2013
@@ -23,6 +23,9 @@
Highlights search keywords in results
</description>
+ <!-- some files for testing that do not have license headers -->
+ <property name="rat.excludes" value="**/*.utf8"/>
+
<import file="../module-build.xml"/>
<path id="classpath">
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Thu May 30 07:53:18 2013
@@ -251,7 +251,7 @@ public class TokenSources {
if (unsortedTokens != null) {
tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
.size()]);
- ArrayUtil.mergeSort(tokensInOriginalOrder, new Comparator<Token>() {
+ ArrayUtil.timSort(tokensInOriginalOrder, new Comparator<Token>() {
@Override
public int compare(Token t1, Token t2) {
if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java Thu May 30 07:53:18 2013
@@ -86,7 +86,7 @@ public final class TokenStreamFromTermPo
this.positionedTokens.add(token);
}
}
- CollectionUtil.mergeSort(this.positionedTokens, tokenComparator);
+ CollectionUtil.timSort(this.positionedTokens, tokenComparator);
int lastPosition = -1;
for (final Token token : this.positionedTokens) {
int thisPosition = token.getPositionIncrement();
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java Thu May 30 07:53:18 2013
@@ -69,7 +69,7 @@ public class WeightedSpanTermExtractor {
private boolean cachedTokenStream;
private boolean wrapToCaching = true;
private int maxDocCharsToAnalyze;
- private AtomicReader reader = null;
+ private AtomicReader internalReader = null;
public WeightedSpanTermExtractor() {
@@ -350,7 +350,7 @@ public class WeightedSpanTermExtractor {
}
protected AtomicReaderContext getLeafContext() throws IOException {
- if (reader == null) {
+ if (internalReader == null) {
if(wrapToCaching && !(tokenStream instanceof CachingTokenFilter)) {
assert !cachedTokenStream;
tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
@@ -361,9 +361,9 @@ public class WeightedSpanTermExtractor {
tokenStream.reset();
final IndexSearcher searcher = indexer.createSearcher();
// MEM index has only atomic ctx
- reader = new DelegatingAtomicReader(((AtomicReaderContext)searcher.getTopReaderContext()).reader());
+ internalReader = new DelegatingAtomicReader(((AtomicReaderContext)searcher.getTopReaderContext()).reader());
}
- return reader.getContext();
+ return internalReader.getContext();
}
/*
@@ -468,7 +468,7 @@ public class WeightedSpanTermExtractor {
try {
extract(query, terms);
} finally {
- IOUtils.close(reader);
+ IOUtils.close(internalReader);
}
return terms;
@@ -516,7 +516,7 @@ public class WeightedSpanTermExtractor {
weightedSpanTerm.weight *= idf;
}
} finally {
- IOUtils.close(reader);
+ IOUtils.close(internalReader);
}
return terms;
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java Thu May 30 07:53:18 2013
@@ -17,10 +17,10 @@ package org.apache.lucene.search.posting
* limitations under the License.
*/
-import org.apache.lucene.index.Term;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.SorterTemplate;
/**
* Represents a passage (typically a sentence of the document).
@@ -36,18 +36,24 @@ public final class Passage {
int matchStarts[] = new int[8];
int matchEnds[] = new int[8];
- Term matchTerms[] = new Term[8];
+ BytesRef matchTerms[] = new BytesRef[8];
int numMatches = 0;
- void addMatch(int startOffset, int endOffset, Term term) {
+ void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) {
- matchStarts = ArrayUtil.grow(matchStarts, numMatches+1);
- matchEnds = ArrayUtil.grow(matchEnds, numMatches+1);
- Term newMatchTerms[] = new Term[ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
+ int newMatchStarts[] = new int[newLength];
+ int newMatchEnds[] = new int[newLength];
+ BytesRef newMatchTerms[] = new BytesRef[newLength];
+ System.arraycopy(matchStarts, 0, newMatchStarts, 0, numMatches);
+ System.arraycopy(matchEnds, 0, newMatchEnds, 0, numMatches);
System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
+ matchStarts = newMatchStarts;
+ matchEnds = newMatchEnds;
matchTerms = newMatchTerms;
}
+ assert matchStarts.length == matchEnds.length && matchEnds.length == matchTerms.length;
matchStarts[numMatches] = startOffset;
matchEnds[numMatches] = endOffset;
matchTerms[numMatches] = term;
@@ -57,8 +63,8 @@ public final class Passage {
void sort() {
final int starts[] = matchStarts;
final int ends[] = matchEnds;
- final Term terms[] = matchTerms;
- new SorterTemplate() {
+ final BytesRef terms[] = matchTerms;
+ new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int temp = starts[i];
@@ -69,30 +75,17 @@ public final class Passage {
ends[i] = ends[j];
ends[j] = temp;
- Term tempTerm = terms[i];
+ BytesRef tempTerm = terms[i];
terms[i] = terms[j];
terms[j] = tempTerm;
}
@Override
protected int compare(int i, int j) {
- // TODO: java7 use Integer.compare(starts[i], starts[j])
- return Long.signum(((long)starts[i]) - starts[j]);
+ return Integer.compare(starts[i], starts[j]);
}
- @Override
- protected void setPivot(int i) {
- pivot = starts[i];
- }
-
- @Override
- protected int comparePivot(int j) {
- // TODO: java7 use Integer.compare(pivot, starts[j])
- return Long.signum(((long)pivot) - starts[j]);
- }
-
- int pivot;
- }.mergeSort(0, numMatches-1);
+ }.sort(0, numMatches);
}
void reset() {
@@ -157,11 +150,11 @@ public final class Passage {
}
/**
- * Term of the matches, corresponding with {@link #getMatchStarts()}.
+ * BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}.
* <p>
* Only {@link #getNumMatches()} are valid.
*/
- public Term[] getMatchTerms() {
+ public BytesRef[] getMatchTerms() {
return matchTerms;
}
}
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java Thu May 30 07:53:18 2013
@@ -19,74 +19,20 @@ package org.apache.lucene.search.posting
/**
* Creates a formatted snippet from the top passages.
- * <p>
- * The default implementation marks the query terms as bold, and places
- * ellipses between unconnected passages.
+ *
* @lucene.experimental
*/
-public class PassageFormatter {
- private final String preTag;
- private final String postTag;
- private final String ellipsis;
-
- /**
- * Creates a new PassageFormatter with the default tags.
- */
- public PassageFormatter() {
- this("<b>", "</b>", "... ");
- }
-
- /**
- * Creates a new PassageFormatter with custom tags.
- * @param preTag text which should appear before a highlighted term.
- * @param postTag text which should appear after a highlighted term.
- * @param ellipsis text which should be used to connect two unconnected passages.
- */
- public PassageFormatter(String preTag, String postTag, String ellipsis) {
- if (preTag == null || postTag == null || ellipsis == null) {
- throw new NullPointerException();
- }
- this.preTag = preTag;
- this.postTag = postTag;
- this.ellipsis = ellipsis;
- }
-
+public abstract class PassageFormatter {
+
/**
* Formats the top <code>passages</code> from <code>content</code>
* into a human-readable text snippet.
- *
+ *
* @param passages top-N passages for the field. Note these are sorted in
* the order that they appear in the document for convenience.
* @param content content for the field.
* @return formatted highlight
*/
- public String format(Passage passages[], String content) {
- StringBuilder sb = new StringBuilder();
- int pos = 0;
- for (Passage passage : passages) {
- // don't add ellipsis if its the first one, or if its connected.
- if (passage.startOffset > pos && pos > 0) {
- sb.append(ellipsis);
- }
- pos = passage.startOffset;
- for (int i = 0; i < passage.numMatches; i++) {
- int start = passage.matchStarts[i];
- int end = passage.matchEnds[i];
- // its possible to have overlapping terms
- if (start > pos) {
- sb.append(content.substring(pos, start));
- }
- if (end > pos) {
- sb.append(preTag);
- sb.append(content.substring(Math.max(pos, start), end));
- sb.append(postTag);
- pos = end;
- }
- }
- // its possible a "term" from the analyzer could span a sentence boundary.
- sb.append(content.substring(pos, Math.max(pos, passage.endOffset)));
- pos = passage.endOffset;
- }
- return sb.toString();
- }
+ public abstract String format(Passage passages[], String content);
+
}
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java Thu May 30 07:53:18 2013
@@ -30,15 +30,37 @@ public class PassageScorer {
// TODO: this formula is completely made up. It might not provide relevant snippets!
/** BM25 k1 parameter, controls term frequency normalization */
- public static final float k1 = 1.2f;
+ final float k1;
/** BM25 b parameter, controls length normalization. */
- public static final float b = 0.75f;
+ final float b;
+ /** A pivot used for length normalization. */
+ final float pivot;
/**
- * A pivot used for length normalization.
- * The default value is the typical average English sentence length.
+ * Creates PassageScorer with these default values:
+ * <ul>
+ * <li>{@code k1 = 1.2},
+ * <li>{@code b = 0.75}.
+ * <li>{@code pivot = 87}
+ * </ul>
*/
- public static final float pivot = 87f;
+ public PassageScorer() {
+ // 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ?
+ // 87 is typical average english sentence length.
+ this(1.2f, 0.75f, 87f);
+ }
+
+ /**
+ * Creates PassageScorer with specified scoring parameters
+ * @param k1 Controls non-linear term frequency normalization (saturation).
+ * @param b Controls to what degree passage length normalizes tf values.
+ * @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters).
+ */
+ public PassageScorer(float k1, float b, float pivot) {
+ this.k1 = k1;
+ this.b = b;
+ this.pivot = pivot;
+ }
/**
* Computes term importance, given its in-document statistics.
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Thu May 30 07:53:18 2013
@@ -19,6 +19,7 @@ package org.apache.lucene.search.posting
import java.io.IOException;
import java.text.BreakIterator;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
@@ -33,6 +34,7 @@ import org.apache.lucene.index.AtomicRea
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.MultiReader;
@@ -41,12 +43,12 @@ import org.apache.lucene.index.StoredFie
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.UnicodeUtil;
/**
@@ -62,7 +64,7 @@ import org.apache.lucene.util.UnicodeUti
* into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
* Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
* <p>
- * <b>WARNING</b>: The code is very new and may still have some exciting bugs!
+ * <b>WARNING</b>: The code is very new and probably still has some exciting bugs!
* <p>
* Example usage:
* <pre class="prettyprint">
@@ -81,7 +83,7 @@ import org.apache.lucene.util.UnicodeUti
* This is thread-safe, and can be used across different readers.
* @lucene.experimental
*/
-public final class PostingsHighlighter {
+public class PostingsHighlighter {
// TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
// but if the analyzer is really fast and the field is tiny, this might really be
@@ -95,9 +97,14 @@ public final class PostingsHighlighter {
public static final int DEFAULT_MAX_LENGTH = 10000;
private final int maxLength;
- private final BreakIterator breakIterator;
- private final PassageScorer scorer;
- private final PassageFormatter formatter;
+
+ /** Set the first time {@link #getFormatter} is called,
+ * and then reused. */
+ private PassageFormatter defaultFormatter;
+
+ /** Set the first time {@link #getScorer} is called,
+ * and then reused. */
+ private PassageScorer defaultScorer;
/**
* Creates a new highlighter with default parameters.
@@ -112,32 +119,44 @@ public final class PostingsHighlighter {
* @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
*/
public PostingsHighlighter(int maxLength) {
- this(maxLength, BreakIterator.getSentenceInstance(Locale.ROOT), new PassageScorer(), new PassageFormatter());
- }
-
- /**
- * Creates a new highlighter with custom parameters.
- * @param maxLength maximum content size to process.
- * @param breakIterator used for finding passage boundaries.
- * @param scorer used for ranking passages.
- * @param formatter used for formatting passages into highlighted snippets.
- * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
- */
- public PostingsHighlighter(int maxLength, BreakIterator breakIterator, PassageScorer scorer, PassageFormatter formatter) {
if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
// two reasons: no overflow problems in BreakIterator.preceding(offset+1),
// our sentinel in the offsets queue uses this value to terminate.
throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
}
- if (breakIterator == null || scorer == null || formatter == null) {
- throw new NullPointerException();
- }
this.maxLength = maxLength;
- this.breakIterator = breakIterator;
- this.scorer = scorer;
- this.formatter = formatter;
}
+ /** Returns the {@link BreakIterator} to use for
+ * dividing text into passages. This returns
+ * {@link BreakIterator#getSentenceInstance(Locale)} by default;
+ * subclasses can override to customize. */
+ protected BreakIterator getBreakIterator(String field) {
+ return BreakIterator.getSentenceInstance(Locale.ROOT);
+ }
+
+ /** Returns the {@link PassageFormatter} to use for
+ * formatting passages into highlighted snippets. This
+ * returns a new {@code PassageFormatter} by default;
+ * subclasses can override to customize. */
+ protected PassageFormatter getFormatter(String field) {
+ if (defaultFormatter == null) {
+ defaultFormatter = new DefaultPassageFormatter();
+ }
+ return defaultFormatter;
+ }
+
+ /** Returns the {@link PassageScorer} to use for
+ * ranking passages. This
+ * returns a new {@code PassageScorer} by default;
+ * subclasses can override to customize. */
+ protected PassageScorer getScorer(String field) {
+ if (defaultScorer == null) {
+ defaultScorer = new PassageScorer();
+ }
+ return defaultScorer;
+ }
+
/**
* Highlights the top passages from a single field.
*
@@ -147,7 +166,8 @@ public final class PostingsHighlighter {
* @param searcher searcher that was previously used to execute the query.
* @param topDocs TopDocs containing the summary result documents to highlight.
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
- * If no highlights were found for a document, its value is <code>null</code>.
+ * If no highlights were found for a document, the
+ * first sentence for the field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
@@ -167,13 +187,15 @@ public final class PostingsHighlighter {
* @param maxPassages The maximum number of top-N ranked passages used to
* form the highlighted snippets.
* @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
- * If no highlights were found for a document, its value is <code>null</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
- Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, maxPassages);
+ Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages });
return res.get(field);
}
@@ -196,13 +218,16 @@ public final class PostingsHighlighter {
* @param topDocs TopDocs containing the summary result documents to highlight.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>.
- * If no highlights were found for a document, its value is <code>null</code>.
+ * If no highlights were found for a document, the
+ * first sentence from the field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
- return highlightFields(fields, query, searcher, topDocs, 1);
+ int maxPassages[] = new int[fields.length];
+ Arrays.fill(maxPassages, 1);
+ return highlightFields(fields, query, searcher, topDocs, maxPassages);
}
/**
@@ -226,69 +251,148 @@ public final class PostingsHighlighter {
* form the highlighted snippets.
* @return Map keyed on field name, containing the array of formatted snippets
* corresponding to the documents in <code>topDocs</code>.
- * If no highlights were found for a document, its value is <code>null</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
* @throws IOException if an I/O error occurred during processing
* @throws IllegalArgumentException if <code>field</code> was indexed without
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
*/
- public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
- final IndexReader reader = searcher.getIndexReader();
+ public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException {
final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
- query = rewrite(query);
- SortedSet<Term> queryTerms = new TreeSet<Term>();
- query.extractTerms(queryTerms);
-
int docids[] = new int[scoreDocs.length];
for (int i = 0; i < docids.length; i++) {
docids[i] = scoreDocs[i].doc;
}
+
+ return highlightFields(fields, query, searcher, docids, maxPassages);
+ }
+
+ /**
+ * Highlights the top-N passages from multiple fields,
+ * for the provided int[] docids.
+ *
+ * @param fieldsIn field names to highlight.
+ * Must have a stored string value and also be indexed with offsets.
+ * @param query query to highlight.
+ * @param searcher searcher that was previously used to execute the query.
+ * @param docidsIn containing the document IDs to highlight.
+ * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
+ * form the highlighted snippets.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the field will
+ * be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
+ if (fieldsIn.length < 1) {
+ throw new IllegalArgumentException("fieldsIn must not be empty");
+ }
+ if (fieldsIn.length != maxPassagesIn.length) {
+ throw new IllegalArgumentException("invalid number of maxPassagesIn");
+ }
+ final IndexReader reader = searcher.getIndexReader();
+ query = rewrite(query);
+ SortedSet<Term> queryTerms = new TreeSet<Term>();
+ query.extractTerms(queryTerms);
+
IndexReaderContext readerContext = reader.getContext();
List<AtomicReaderContext> leaves = readerContext.leaves();
- BreakIterator bi = (BreakIterator)breakIterator.clone();
+ // Make our own copies because we sort in-place:
+ int[] docids = new int[docidsIn.length];
+ System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length);
+ final String fields[] = new String[fieldsIn.length];
+ System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
+ final int maxPassages[] = new int[maxPassagesIn.length];
+ System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
// sort for sequential io
Arrays.sort(docids);
- Arrays.sort(fields);
+ new InPlaceMergeSorter() {
+
+ @Override
+ protected void swap(int i, int j) {
+ String tmp = fields[i];
+ fields[i] = fields[j];
+ fields[j] = tmp;
+ int tmp2 = maxPassages[i];
+ maxPassages[i] = maxPassages[j];
+ maxPassages[j] = tmp2;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ return fields[i].compareTo(fields[j]);
+ }
+
+ }.sort(0, fields.length);
// pull stored data:
- LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
- String contents[][] = new String[fields.length][docids.length];
- for (int i = 0; i < docids.length; i++) {
- searcher.doc(docids[i], visitor);
- for (int j = 0; j < fields.length; j++) {
- contents[j][i] = visitor.getValue(j).toString();
- }
- visitor.reset();
- }
+ String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
Map<String,String[]> highlights = new HashMap<String,String[]>();
for (int i = 0; i < fields.length; i++) {
String field = fields[i];
+ int numPassages = maxPassages[i];
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
// TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
- Term terms[] = fieldTerms.toArray(new Term[fieldTerms.size()]);
- Map<Integer,String> fieldHighlights = highlightField(field, contents[i], bi, terms, docids, leaves, maxPassages);
+
+ // Strip off the redundant field:
+ BytesRef terms[] = new BytesRef[fieldTerms.size()];
+ int termUpto = 0;
+ for(Term term : fieldTerms) {
+ terms[termUpto++] = term.bytes();
+ }
+ Map<Integer,String> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages);
- String[] result = new String[scoreDocs.length];
- for (int j = 0; j < scoreDocs.length; j++) {
- result[j] = fieldHighlights.get(scoreDocs[j].doc);
+ String[] result = new String[docids.length];
+ for (int j = 0; j < docidsIn.length; j++) {
+ result[j] = fieldHighlights.get(docidsIn[j]);
}
highlights.put(field, result);
}
return highlights;
}
+
+ /** Loads the String values for each field X docID to be
+ * highlighted. By default this loads from stored
+ * fields, but a subclass can change the source. This
+ * method should allocate the String[fields.length][docids.length]
+ * and fill all values. The returned Strings must be
+ * identical to what was indexed. */
+ protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
+ String contents[][] = new String[fields.length][docids.length];
+ LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
+ for (int i = 0; i < docids.length; i++) {
+ searcher.doc(docids[i], visitor);
+ for (int j = 0; j < fields.length; j++) {
+ contents[j][i] = visitor.getValue(j).toString();
+ }
+ visitor.reset();
+ }
+ return contents;
+ }
- private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, Term terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
+ private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {
Map<Integer,String> highlights = new HashMap<Integer,String>();
// reuse in the real sense... for docs in same segment we just advance our old enum
DocsAndPositionsEnum postings[] = null;
TermsEnum termsEnum = null;
int lastLeaf = -1;
-
+
+ PassageFormatter fieldFormatter = getFormatter(field);
+ if (fieldFormatter == null) {
+ throw new NullPointerException("PassageFormatter cannot be null");
+ }
+
for (int i = 0; i < docids.length; i++) {
String content = contents[i];
if (content.length() == 0) {
@@ -308,9 +412,13 @@ public final class PostingsHighlighter {
postings = new DocsAndPositionsEnum[terms.length];
}
Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
+ if (passages.length == 0) {
+ passages = getEmptyHighlight(field, bi, maxPassages);
+ }
if (passages.length > 0) {
- // otherwise a null snippet
- highlights.put(doc, formatter.format(passages, content));
+ // otherwise a null snippet (eg if field is missing
+ // entirely from the doc)
+ highlights.put(doc, fieldFormatter.format(passages, content));
}
lastLeaf = leaf;
}
@@ -321,8 +429,12 @@ public final class PostingsHighlighter {
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),s
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
- private Passage[] highlightDoc(String field, Term terms[], int contentLength, BreakIterator bi, int doc,
+ private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n) throws IOException {
+ PassageScorer scorer = getScorer(field);
+ if (scorer == null) {
+ throw new NullPointerException("PassageScorer cannot be null");
+ }
PriorityQueue<OffsetsEnum> pq = new PriorityQueue<OffsetsEnum>();
float weights[] = new float[terms.length];
// initialize postings
@@ -333,7 +445,7 @@ public final class PostingsHighlighter {
continue;
} else if (de == null) {
postings[i] = EMPTY; // initially
- if (!termsEnum.seekExact(terms[i].bytes(), true)) {
+ if (!termsEnum.seekExact(terms[i], true)) {
continue; // term not found
}
de = postings[i] = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_OFFSETS);
@@ -361,10 +473,12 @@ public final class PostingsHighlighter {
PriorityQueue<Passage> passageQueue = new PriorityQueue<Passage>(n, new Comparator<Passage>() {
@Override
public int compare(Passage left, Passage right) {
- if (right.score == left.score) {
- return right.startOffset - left.endOffset;
+ if (left.score < right.score) {
+ return -1;
+ } else if (left.score > right.score) {
+ return 1;
} else {
- return right.score > left.score ? 1 : -1;
+ return left.startOffset - right.startOffset;
}
}
});
@@ -435,7 +549,35 @@ public final class PostingsHighlighter {
}
current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
}
- return new Passage[0];
+
+ // Dead code but compiler disagrees:
+ assert false;
+ return null;
+ }
+
+ /** Called to summarize a document when no hits were
+ * found. By default this just returns the first
+ * {@code maxPassages} sentences; subclasses can override
+ * to customize. */
+ protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
+ // BreakIterator should be un-next'd:
+ List<Passage> passages = new ArrayList<Passage>();
+ int pos = bi.current();
+ assert pos == 0;
+ while (passages.size() < maxPassages) {
+ int next = bi.next();
+ if (next == BreakIterator.DONE) {
+ break;
+ }
+ Passage passage = new Passage();
+ passage.score = Float.NaN;
+ passage.startOffset = pos;
+ passage.endOffset = next;
+ passages.add(passage);
+ pos = next;
+ }
+
+ return passages.toArray(new Passage[passages.size()]);
}
private static class OffsetsEnum implements Comparable<OffsetsEnum> {
@@ -457,7 +599,7 @@ public final class PostingsHighlighter {
if (off == otherOff) {
return id - other.id;
} else {
- return Long.signum(((long)off) - otherOff);
+ return Integer.compare(off, otherOff);
}
} catch (IOException e) {
throw new RuntimeException(e);
@@ -490,6 +632,9 @@ public final class PostingsHighlighter {
@Override
public int advance(int target) throws IOException { return NO_MORE_DOCS; }
+
+ @Override
+ public long cost() { return 0; }
};
/**
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java Thu May 30 07:53:18 2013
@@ -46,63 +46,98 @@ public abstract class BaseFragListBuilde
this( MARGIN_DEFAULT );
}
- protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){
-
+ protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){
if( fragCharSize < minFragCharSize )
throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." );
List<WeightedPhraseInfo> wpil = new ArrayList<WeightedPhraseInfo>();
- Iterator<WeightedPhraseInfo> ite = fieldPhraseList.getPhraseList().iterator();
+ IteratorQueue<WeightedPhraseInfo> queue = new IteratorQueue<WeightedPhraseInfo>(fieldPhraseList.getPhraseList().iterator());
WeightedPhraseInfo phraseInfo = null;
int startOffset = 0;
- boolean taken = false;
- while( true ){
- if( !taken ){
- if( !ite.hasNext() ) break;
- phraseInfo = ite.next();
- }
- taken = false;
- if( phraseInfo == null ) break;
-
+ while((phraseInfo = queue.top()) != null){
// if the phrase violates the border of previous fragment, discard it and try next phrase
- if( phraseInfo.getStartOffset() < startOffset ) continue;
-
+ if( phraseInfo.getStartOffset() < startOffset ) {
+ queue.removeTop();
+ continue;
+ }
+
wpil.clear();
- wpil.add( phraseInfo );
- int firstOffset = phraseInfo.getStartOffset();
- int st = phraseInfo.getStartOffset() - margin < startOffset ?
- startOffset : phraseInfo.getStartOffset() - margin;
- int en = st + fragCharSize;
- if( phraseInfo.getEndOffset() > en )
- en = phraseInfo.getEndOffset();
-
- int lastEndOffset = phraseInfo.getEndOffset();
- while( true ){
- if( ite.hasNext() ){
- phraseInfo = ite.next();
- taken = true;
- if( phraseInfo == null ) break;
- }
- else
+ final int currentPhraseStartOffset = phraseInfo.getStartOffset();
+ int currentPhraseEndOffset = phraseInfo.getEndOffset();
+ int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset);
+ int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize);
+ if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
+ wpil.add(phraseInfo);
+ }
+ while((phraseInfo = queue.top()) != null) { // pull until we crossed the current spanEnd
+ if (phraseInfo.getEndOffset() <= spanEnd) {
+ currentPhraseEndOffset = phraseInfo.getEndOffset();
+ if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
+ wpil.add(phraseInfo);
+ }
+ } else {
break;
- if( phraseInfo.getEndOffset() <= en ){
- wpil.add( phraseInfo );
- lastEndOffset = phraseInfo.getEndOffset();
}
- else
- break;
}
- int matchLen = lastEndOffset - firstOffset;
- //now recalculate the start and end position to "center" the result
- int newMargin = (fragCharSize-matchLen)/2;
- st = firstOffset - newMargin;
- if(st<startOffset){
- st = startOffset;
+ if (wpil.isEmpty()) {
+ continue;
+ }
+
+ final int matchLen = currentPhraseEndOffset - currentPhraseStartOffset;
+ // now recalculate the start and end position to "center" the result
+ final int newMargin = Math.max(0, (fragCharSize-matchLen)/2); // matchLen can be > fragCharSize prevent IAOOB here
+ spanStart = currentPhraseStartOffset - newMargin;
+ if (spanStart < startOffset) {
+ spanStart = startOffset;
}
- en = st+fragCharSize;
- startOffset = en;
- fieldFragList.add( st, en, wpil );
+ // whatever is bigger here we grow this out
+ spanEnd = spanStart + Math.max(matchLen, fragCharSize);
+ startOffset = spanEnd;
+ fieldFragList.add(spanStart, spanEnd, wpil);
}
return fieldFragList;
}
+
+ /**
+ * A predicate to decide if the given {@link WeightedPhraseInfo} should be
+ * accepted as a highlighted phrase or if it should be discarded.
+ * <p>
+ * The default implementation discards phrases that are composed of more than one term
+ * and where the matchLength exceeds the fragment character size.
+ *
+ * @param info the phrase info to accept
+ * @param matchLength the match length of the current phrase
+ * @param fragCharSize the configured fragment character size
+ * @return <code>true</code> if this phrase info should be accepted as a highlighted phrase
+ */
+ protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) {
+ return info.getTermsOffsets().size() <= 1 || matchLength <= fragCharSize;
+ }
+
+ private static final class IteratorQueue<T> {
+ private final Iterator<T> iter;
+ private T top;
+
+ public IteratorQueue(Iterator<T> iter) {
+ this.iter = iter;
+ T removeTop = removeTop();
+ assert removeTop == null;
+ }
+
+ public T top() {
+ return top;
+ }
+
+ public T removeTop() {
+ T currentTop = top;
+ if (iter.hasNext()) {
+ top = iter.next();
+ } else {
+ top = null;
+ }
+ return currentTop;
+ }
+
+ }
+
}
Modified: lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java Thu May 30 07:53:18 2013
@@ -81,7 +81,7 @@ public class FieldPhraseList {
if( ti != null )
nextMap = currMap.getTermMap( ti.getText() );
if( ti == null || nextMap == null ){
- if( ti != null )
+ if( ti != null )
fieldTermStack.push( ti );
if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
Modified: lucene/dev/branches/security/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Thu May 30 07:53:18 2013
@@ -91,7 +91,7 @@ public class HighlighterTest extends Bas
phraseQuery.add(new Term(FIELD_NAME, "long"));
query = phraseQuery;
- searcher = new IndexSearcher(reader);
+ searcher = newSearcher(reader);
TopDocs hits = searcher.search(query, 10);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
@@ -122,7 +122,7 @@ public class HighlighterTest extends Bas
query.add(new Term(FIELD_NAME, "long"));
query.add(new Term(FIELD_NAME, "very"));
- searcher = new IndexSearcher(reader);
+ searcher = newSearcher(reader);
TopDocs hits = searcher.search(query, 10);
assertEquals(2, hits.totalHits);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
@@ -164,12 +164,21 @@ public class HighlighterTest extends Bas
public String toString(String field) {
return null;
}
-
+
+ @Override
+ public int hashCode() {
+ return 31 * super.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ return super.equals(obj);
+ }
};
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
- searcher = new IndexSearcher(reader);
+ searcher = newSearcher(reader);
TopDocs hits = searcher.search(query, 10);
assertEquals(2, hits.totalHits);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
@@ -238,7 +247,7 @@ public class HighlighterTest extends Bas
*/
private String highlightField(Query query, String fieldName, String text)
throws IOException, InvalidTokenOffsetsException {
- TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)
+ TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
.tokenStream(fieldName, new StringReader(text));
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
@@ -413,7 +422,7 @@ public class HighlighterTest extends Bas
public void testSpanRegexQuery() throws Exception {
query = new SpanOrQuery(new SpanMultiTermQueryWrapper<RegexpQuery>(new RegexpQuery(new Term(FIELD_NAME, "ken.*"))));
- searcher = new IndexSearcher(reader);
+ searcher = newSearcher(reader);
hits = searcher.search(query, 100);
int maxNumFragmentsRequired = 2;
@@ -437,7 +446,7 @@ public class HighlighterTest extends Bas
public void testRegexQuery() throws Exception {
query = new RegexpQuery(new Term(FIELD_NAME, "ken.*"));
- searcher = new IndexSearcher(reader);
+ searcher = newSearcher(reader);
hits = searcher.search(query, 100);
int maxNumFragmentsRequired = 2;
@@ -459,10 +468,35 @@ public class HighlighterTest extends Bas
numHighlights == 5);
}
+ public void testExternalReader() throws Exception {
+ query = new RegexpQuery(new Term(FIELD_NAME, "ken.*"));
+ searcher = newSearcher(reader);
+ hits = searcher.search(query, 100);
+ int maxNumFragmentsRequired = 2;
+
+ QueryScorer scorer = new QueryScorer(query, reader, FIELD_NAME);
+ Highlighter highlighter = new Highlighter(this, scorer);
+
+ for (int i = 0; i < hits.totalHits; i++) {
+ String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+ TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+
+ highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+ String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+ "...");
+ if (VERBOSE) System.out.println("\t" + result);
+ }
+
+ assertTrue(reader.docFreq(new Term(FIELD_NAME, "hello")) > 0);
+ assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+ numHighlights == 5);
+ }
+
public void testNumericRangeQuery() throws Exception {
// doesn't currently highlight, but make sure it doesn't cause exception either
query = NumericRangeQuery.newIntRange(NUMERIC_FIELD_NAME, 2, 6, true, true);
- searcher = new IndexSearcher(reader);
+ searcher = newSearcher(reader);
hits = searcher.search(query, 100);
int maxNumFragmentsRequired = 2;
@@ -861,7 +895,7 @@ public class HighlighterTest extends Bas
query = new WildcardQuery(new Term(FIELD_NAME, "ken*"));
((WildcardQuery)query).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
- searcher = new IndexSearcher(reader);
+ searcher = newSearcher(reader);
// can't rewrite ConstantScore if you want to highlight it -
// it rewrites to ConstantScoreQuery which cannot be highlighted
// query = unReWrittenQuery.rewrite(reader);
@@ -1274,7 +1308,7 @@ public class HighlighterTest extends Bas
}
public void testMaxSizeHighlight() throws Exception {
- final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
analyzer.setEnableChecks(false);
@@ -1309,7 +1343,7 @@ public class HighlighterTest extends Bas
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken"));
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
- final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
+ final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
analyzer.setEnableChecks(false);
TermQuery query = new TermQuery(new Term("data", goodWord));
@@ -1360,7 +1394,7 @@ public class HighlighterTest extends Bas
Highlighter hg = getHighlighter(query, "text", fm);
hg.setTextFragmenter(new NullFragmenter());
hg.setMaxDocCharsToAnalyze(36);
- String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true), "text", text);
+ String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text);
assertTrue(
"Matched text should contain remainder of text after highlighted query ",
match.endsWith("in it"));
@@ -1376,8 +1410,8 @@ public class HighlighterTest extends Bas
public void run() throws Exception {
numHighlights = 0;
// test to show how rewritten query can still be used
- searcher = new IndexSearcher(reader);
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ searcher = newSearcher(reader);
+ Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
BooleanQuery query = new BooleanQuery();
query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
@@ -1756,7 +1790,7 @@ public class HighlighterTest extends Bas
private void searchIndex() throws IOException, InvalidTokenOffsetsException {
Query query = new TermQuery(new Term("t_text1", "random"));
IndexReader reader = DirectoryReader.open(dir);
- IndexSearcher searcher = new IndexSearcher(reader);
+ IndexSearcher searcher = newSearcher(reader);
// This scorer can return negative idf -> null fragment
Scorer scorer = new QueryTermScorer( query, searcher.getIndexReader(), "t_text1" );
// This scorer doesn't use idf (patch version)
@@ -1809,7 +1843,7 @@ public class HighlighterTest extends Bas
}
public void doSearching(Query unReWrittenQuery) throws Exception {
- searcher = new IndexSearcher(reader);
+ searcher = newSearcher(reader);
// for any multi-term queries to work (prefix, wildcard, range,fuzzy etc)
// you must use a rewritten query!
query = unReWrittenQuery.rewrite(reader);
@@ -1841,11 +1875,11 @@ public class HighlighterTest extends Bas
super.setUp();
a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
- analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
dir = newDirectory();
ramDir = newDirectory();
IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
+ TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
for (String text : texts) {
addDoc(writer, text);
}
Modified: lucene/dev/branches/security/lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java?rev=1487777&r1=1487776&r2=1487777&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java (original)
+++ lucene/dev/branches/security/lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java Thu May 30 07:53:18 2013
@@ -89,7 +89,7 @@ public class HighlightCustomQueryTest ex
private String highlightField(Query query, String fieldName,
String text) throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE,
- true, MockTokenFilter.ENGLISH_STOPSET, true).tokenStream(fieldName,
+ true, MockTokenFilter.ENGLISH_STOPSET).tokenStream(fieldName,
new StringReader(text));
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();