You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2017/01/08 04:10:54 UTC
lucene-solr:master: LUCENE-7620: UnifiedHighlighter: new
LengthGoalBreakIterator wrapper
Repository: lucene-solr
Updated Branches:
refs/heads/master ac14fc32e -> ea4998952
LUCENE-7620: UnifiedHighlighter: new LengthGoalBreakIterator wrapper
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ea499895
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ea499895
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ea499895
Branch: refs/heads/master
Commit: ea49989524e96563f2b9bdd4256012239907882f
Parents: ac14fc3
Author: David Smiley <ds...@apache.org>
Authored: Sat Jan 7 23:10:48 2017 -0500
Committer: David Smiley <ds...@apache.org>
Committed: Sat Jan 7 23:10:48 2017 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 6 +-
.../uhighlight/LengthGoalBreakIterator.java | 185 +++++++++++++++++++
.../lucene/search/uhighlight/Passage.java | 1 +
.../uhighlight/LengthGoalBreakIteratorTest.java | 104 +++++++++++
4 files changed, 295 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ea499895/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 30c9ab0..4bbf9ee 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -216,7 +216,11 @@ Improvements
ensure all dimensions are indexed. (Adrien Grand)
* LUCENE-7614: Complex Phrase Query parser ignores double quotes around single token
- prefix, wildcard, range queries (Mikhail Khludnev)
+ prefix, wildcard, range queries (Mikhail Khludnev)
+
+* LUCENE-7620: Added LengthGoalBreakIterator, a wrapper around another B.I. to skip breaks
+ that would create Passages that are too short. Only for use with the UnifiedHighlighter
+ (and probably PostingsHighlighter). (David Smiley)
Optimizations
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ea499895/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
new file mode 100644
index 0000000..3134013
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/LengthGoalBreakIterator.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.uhighlight;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+/**
+ * Wraps another {@link BreakIterator} to skip past breaks that would result in passages that are too
+ * short. It's still possible to get a short passage but only at the very end of the input text.
+ * <p>
+ * Important: This is not a general purpose {@link BreakIterator}; it's only designed to work in a way
+ * compatible with the {@link UnifiedHighlighter}. Some assumptions are checked with Java assertions.
+ *
+ * @lucene.experimental
+ */
+public class LengthGoalBreakIterator extends BreakIterator {
+
+ private final BreakIterator baseIter;
+ private final int lengthGoal;
+ private final boolean isMinimumLength; // if false then is "closest to" length
+
+ /** Breaks will be at least {@code minLength} apart (to the extent possible). */
+ public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength) {
+ return new LengthGoalBreakIterator(baseIter, minLength, true);
+ }
+
+ /** Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after)
+ * is chosen. */
+ public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength) {
+ return new LengthGoalBreakIterator(baseIter, targetLength, false);
+ }
+
+ private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, boolean isMinimumLength) {
+ this.baseIter = baseIter;
+ this.lengthGoal = lengthGoal;
+ this.isMinimumLength = isMinimumLength;
+ }
+
+ // note: the only methods that will get called are setText(txt), getText(),
+ // getSummaryPassagesNoHighlight: current(), first(), next()
+ // highlightOffsetsEnums: preceding(int), and following(int)
+ // Nonetheless we make some attempt to implement the rest; mostly delegating.
+
+ @Override
+ public String toString() {
+ String goalDesc = isMinimumLength ? "minLen" : "targetLen";
+ return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", baseIter=" + baseIter + "}";
+ }
+
+ @Override
+ public Object clone() {
+ return new LengthGoalBreakIterator((BreakIterator) baseIter.clone(), lengthGoal, isMinimumLength);
+ }
+
+ @Override
+ public CharacterIterator getText() {
+ return baseIter.getText();
+ }
+
+ @Override
+ public void setText(String newText) {
+ baseIter.setText(newText);
+ }
+
+ @Override
+ public void setText(CharacterIterator newText) {
+ baseIter.setText(newText);
+ }
+
+ @Override
+ public int current() {
+ return baseIter.current();
+ }
+
+ @Override
+ public int first() {
+ return baseIter.first();
+ }
+
+ @Override
+ public int last() {
+ return baseIter.last();
+ }
+
+ @Override
+ public int next(int n) {
+ assert false : "Not supported";
+ return baseIter.next(n); // probably wrong
+ }
+
+ // called by getSummaryPassagesNoHighlight to generate default summary.
+ @Override
+ public int next() {
+ return following(current());
+ }
+
+ @Override
+ public int previous() {
+ assert false : "Not supported";
+ return baseIter.previous();
+ }
+
+ // called while the current position is the start of a new passage; find end of passage
+ @Override
+ public int following(int followingIdx) {
+ final int startIdx = current();
+ if (followingIdx < startIdx) {
+ assert false : "Not supported";
+ return baseIter.following(followingIdx);
+ }
+ final int targetIdx = startIdx + lengthGoal;
+ // When followingIdx >= targetIdx, we can simply delegate since it will be >= the target
+ if (followingIdx >= targetIdx - 1) {
+ return baseIter.following(followingIdx);
+ }
+ // If target exceeds the text length, return the last index.
+ if (targetIdx >= getText().getEndIndex()) {
+ return baseIter.last();
+ }
+
+ // Find closest break >= the target
+ final int afterIdx = baseIter.following(targetIdx - 1);
+ if (afterIdx == DONE) { // we're at the end; can this happen?
+ return current();
+ }
+ if (afterIdx == targetIdx) { // right on the money
+ return afterIdx;
+ }
+ if (isMinimumLength) { // thus never undershoot
+ return afterIdx;
+ }
+
+ // note: it is a shame that we invoke preceding() *in addition to* following(); BI's are sometimes expensive.
+
+ // Find closest break < target
+ final int beforeIdx = baseIter.preceding(targetIdx); // or could do baseIter.previous() but we hope the BI implements preceding()
+ if (beforeIdx <= followingIdx) { // too far back
+ return moveToBreak(afterIdx);
+ }
+
+ if (targetIdx - beforeIdx <= afterIdx - targetIdx) {
+ return beforeIdx;
+ }
+ return moveToBreak(afterIdx);
+ }
+
+ private int moveToBreak(int idx) { // precondition: idx is a known break
+ // bi.isBoundary(idx) has side-effect of moving the position. Not obvious!
+ //boolean moved = baseIter.isBoundary(idx); // probably not particularly expensive
+ //assert moved && current() == idx;
+
+ // TODO fix: Would prefer to do "- 1" instead of "- 2" but CustomSeparatorBreakIterator has a bug.
+ int current = baseIter.following(idx - 2);
+ assert current == idx : "following() didn't move us to the expected index.";
+ return idx;
+ }
+
+ // called at start of new Passage given first word start offset
+ @Override
+ public int preceding(int offset) {
+ return baseIter.preceding(offset); // no change needed
+ }
+
+ @Override
+ public boolean isBoundary(int offset) {
+ assert false : "Not supported";
+ return baseIter.isBoundary(offset);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ea499895/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
index d64b96e..3efb694 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
@@ -171,6 +171,7 @@ public class Passage {
/** @lucene.internal */
public void setEndOffset(int endOffset) {
+ assert startOffset <= endOffset;
this.endOffset = endOffset;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ea499895/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java
new file mode 100644
index 0000000..42d2bf6
--- /dev/null
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/LengthGoalBreakIteratorTest.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.QueryBuilder;
+
+public class LengthGoalBreakIteratorTest extends LuceneTestCase {
+ private static final String FIELD = "body";
+
+ // We test LengthGoalBreakIterator as it is used by the UnifiedHighlighter instead of directly, because it is
+ // not a general purpose BreakIterator. A unit test of it directly wouldn't give as much confidence.
+
+ private final Analyzer analyzer =
+ new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);//whitespace, punctuation, lowercase
+
+ // We do a '.' BreakIterator and test varying the length goal.
+ // 0 1
+ // 01234567890123456789
+ final String content = "Aa bb. Cc dd. Ee ff";
+
+ public void testTargetLen() throws IOException {
+ // "goal" means target length goal to find closest break
+
+ // at first word:
+ Query query = query("aa");
+ assertEquals("almost two sent",
+ "<b>Aa</b> bb.", highlightClosestToLen(content, query, 9));
+ assertEquals( "barely two sent",
+ "<b>Aa</b> bb. Cc dd.", highlightClosestToLen(content, query, 10));
+ assertEquals("long goal",
+ "<b>Aa</b> bb. Cc dd. Ee ff", highlightClosestToLen(content, query, 17 + random().nextInt(20)));
+
+ // at some word not at start of passage
+ query = query("dd");
+ assertEquals("short goal",
+ " Cc <b>dd</b>.", highlightClosestToLen(content, query, random().nextInt(5)));
+ assertEquals("almost two sent",
+ " Cc <b>dd</b>.", highlightClosestToLen(content, query, 10));
+ assertEquals("barely two sent",
+ " Cc <b>dd</b>. Ee ff", highlightClosestToLen(content, query, 11));
+ assertEquals("long goal",
+ " Cc <b>dd</b>. Ee ff", highlightClosestToLen(content, query, 12 + random().nextInt(20)));
+ }
+
+ public void testMinLen() throws IOException {
+ // minLen mode is simpler than targetLen... just test a few cases
+
+ Query query = query("dd");
+ assertEquals("almost two sent",
+ " Cc <b>dd</b>.", highlightMinLen(content, query, 6));
+ assertEquals("barely two sent",
+ " Cc <b>dd</b>. Ee ff", highlightMinLen(content, query, 7));
+ }
+
+ public void testDefaultSummaryTargetLen() throws IOException {
+ Query query = query("zz");
+ assertEquals("Aa bb.",
+ highlightClosestToLen(content, query, random().nextInt(10))); // < 10
+ assertEquals("Aa bb. Cc dd.",
+ highlightClosestToLen(content, query, 10 + 6)); // cusp of adding 3rd sentence
+ assertEquals("Aa bb. Cc dd. Ee ff",
+ highlightClosestToLen(content, query, 17 + random().nextInt(20))); // >= 14
+ }
+
+ private Query query(String qStr) {
+ return new QueryBuilder(analyzer).createBooleanQuery(FIELD, qStr);
+ }
+
+ private String highlightClosestToLen(String content, Query query, int lengthGoal) throws IOException {
+ UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
+ highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createClosestToLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
+ return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
+ }
+
+ private String highlightMinLen(String content, Query query, int lengthGoal) throws IOException {
+ // differs from above only by "createMinLength"
+ UnifiedHighlighter highlighter = new UnifiedHighlighter(null, analyzer);
+ highlighter.setBreakIterator(() -> LengthGoalBreakIterator.createMinLength(new CustomSeparatorBreakIterator('.'), lengthGoal));
+ return highlighter.highlightWithoutSearcher(FIELD, query, content, 1).toString();
+ }
+}
\ No newline at end of file