You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/09/12 19:12:51 UTC
svn commit: r1522666 - in /lucene/dev/branches/branch_4x/lucene: ./
suggest/src/java/org/apache/lucene/search/suggest/
suggest/src/java/org/apache/lucene/search/suggest/analyzing/
suggest/src/test/org/apache/lucene/search/suggest/analyzing/
Author: mikemccand
Date: Thu Sep 12 17:12:51 2013
New Revision: 1522666
URL: http://svn.apache.org/r1522666
Log:
LUCENE-5133: allow highlight to Object for AnalyzingInfixSuggester for advanced use cases
Modified:
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java
lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java
lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1522666&r1=1522665&r2=1522666&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Sep 12 17:12:51 2013
@@ -11,6 +11,11 @@ New Features
for advanced use cases where String is too restrictive (Luca
Cavanna, Robert Muir, Mike McCandless)
+* LUCENE-5133: Changed AnalyzingInfixSuggester.highlight to return
+ Object instead of String, to allow for advanced use cases where
+ String is too restrictive (Robert Muir, Shai Erera, Mike
+ McCandless)
+
Changes in backwards compatibility policy
* LUCENE-5204: Directory doesn't have default implementations for
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java?rev=1522666&r1=1522665&r2=1522666&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java Thu Sep 12 17:12:51 2013
@@ -41,6 +41,10 @@ public abstract class Lookup {
/** the key's text */
public final CharSequence key;
+ /** Expert: custom Object to hold the result of a
+ * highlighted suggestion. */
+ public final Object highlightKey;
+
/** the key's weight */
public final long value;
@@ -59,6 +63,17 @@ public abstract class Lookup {
*/
public LookupResult(CharSequence key, long value, BytesRef payload) {
this.key = key;
+ this.highlightKey = null;
+ this.value = value;
+ this.payload = payload;
+ }
+
+ /**
+ * Create a new result from a key+highlightKey+weight+payload triple.
+ */
+ public LookupResult(CharSequence key, Object highlightKey, long value, BytesRef payload) {
+ this.key = key;
+ this.highlightKey = highlightKey;
this.value = value;
this.payload = payload;
}
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java?rev=1522666&r1=1522665&r2=1522666&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java Thu Sep 12 17:12:51 2013
@@ -67,6 +67,7 @@ import org.apache.lucene.search.TermQuer
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.spell.TermFreqPayloadIterator;
+import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
@@ -98,8 +99,10 @@ public class AnalyzingInfixSuggester ext
/** Field name used for the indexed text. */
protected final static String TEXT_FIELD_NAME = "text";
- private final Analyzer queryAnalyzer;
- final Analyzer indexAnalyzer;
+ /** Analyzer used at search time */
+ protected final Analyzer queryAnalyzer;
+ /** Analyzer used at index time */
+ protected final Analyzer indexAnalyzer;
final Version matchVersion;
private final File indexPath;
final int minPrefixChars;
@@ -422,9 +425,6 @@ public class AnalyzingInfixSuggester ext
ScoreDoc sd = hits.scoreDocs[i];
textDV.get(sd.doc, scratch);
String text = scratch.utf8ToString();
- if (doHighlight) {
- text = highlight(text, matchedTokens, prefixToken);
- }
long score = weightsDV.get(sd.doc);
BytesRef payload;
@@ -435,7 +435,15 @@ public class AnalyzingInfixSuggester ext
payload = null;
}
- results.add(new LookupResult(text, score, payload));
+ LookupResult result;
+
+ if (doHighlight) {
+ Object highlightKey = highlight(text, matchedTokens, prefixToken);
+ result = new LookupResult(highlightKey.toString(), highlightKey, score, payload);
+ } else {
+ result = new LookupResult(text, score, payload);
+ }
+ results.add(result);
}
//System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
//System.out.println(results);
@@ -451,7 +459,11 @@ public class AnalyzingInfixSuggester ext
return in;
}
- private String highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
+ /** Override this method to customize the Object
+ * representing a single highlighted suggestions; the
+ * result is set on each {@link
+ * LookupResult#highlightKey} member. */
+ protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
@@ -463,7 +475,7 @@ public class AnalyzingInfixSuggester ext
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (upto < startOffset) {
- sb.append(text.substring(upto, startOffset));
+ addNonMatch(sb, text.substring(upto, startOffset));
upto = startOffset;
} else if (upto > startOffset) {
continue;
@@ -481,24 +493,38 @@ public class AnalyzingInfixSuggester ext
ts.end();
int endOffset = offsetAtt.endOffset();
if (upto < endOffset) {
- sb.append(text.substring(upto));
+ addNonMatch(sb, text.substring(upto));
}
ts.close();
return sb.toString();
}
- /** Appends the whole matched token to the provided {@code
- * StringBuilder}. */
+ /** Called while highlighting a single result, to append a
+ * non-matching chunk of text from the suggestion to the
+ * provided fragments list.
+ * @param sb The {@code StringBuilder} to append to
+ * @param text The text chunk to add
+ */
+ protected void addNonMatch(StringBuilder sb, String text) {
+ sb.append(text);
+ }
+
+ /** Called while highlighting a single result, to append
+ * the whole matched token to the provided fragments list.
+ * @param sb The {@code StringBuilder} to append to
+ * @param surface The surface form (original) text
+ * @param analyzed The analyzed token corresponding to the surface form text
+ */
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
sb.append("<b>");
sb.append(surface);
sb.append("</b>");
}
- /** Append a matched prefix token, to the provided
- * {@code StringBuilder}.
- * @param sb {@code StringBuilder} to append to
+ /** Called while highlighting a single result, to append a
+ * matched prefix token, to the provided fragments list.
+ * @param sb The {@code StringBuilder} to append to
* @param surface The fragment of the surface form
* (indexed during {@link #build}, corresponding to
* this match
@@ -509,13 +535,10 @@ public class AnalyzingInfixSuggester ext
// TODO: apps can try to invert their analysis logic
// here, e.g. downcase the two before checking prefix:
sb.append("<b>");
- if (surface.startsWith(prefixToken)) {
- sb.append(surface.substring(0, prefixToken.length()));
- sb.append("</b>");
+ sb.append(surface.substring(0, prefixToken.length()));
+ sb.append("</b>");
+ if (prefixToken.length() < surface.length()) {
sb.append(surface.substring(prefixToken.length()));
- } else {
- sb.append(surface);
- sb.append("</b>");
}
}
Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java?rev=1522666&r1=1522665&r2=1522666&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java Thu Sep 12 17:12:51 2013
@@ -18,14 +18,20 @@ package org.apache.lucene.search.suggest
*/
import java.io.File;
+import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
import java.util.List;
-import java.util.Locale;
+import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreqPayload;
@@ -120,6 +126,109 @@ public class AnalyzingInfixSuggesterTest
suggester.close();
}
+ /** Used to return highlighted result; see {@link
+ * LookupResult#highlightKey} */
+ private static final class LookupHighlightFragment {
+ /** Portion of text for this fragment. */
+ public final String text;
+
+ /** True if this text matched a part of the user's
+ * query. */
+ public final boolean isHit;
+
+ /** Sole constructor. */
+ public LookupHighlightFragment(String text, boolean isHit) {
+ this.text = text;
+ this.isHit = isHit;
+ }
+
+ @Override
+ public String toString() {
+ return "LookupHighlightFragment(text=" + text + " isHit=" + isHit + ")";
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ public void testHighlightAsObject() throws Exception {
+ TermFreqPayload keys[] = new TermFreqPayload[] {
+ new TermFreqPayload("a penny saved is a penny earned", 10, new BytesRef("foobaz")),
+ };
+
+ File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
+
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+ AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
+ @Override
+ protected Directory getDirectory(File path) {
+ return newDirectory();
+ }
+
+ @Override
+ protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
+ TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+ OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+ ts.reset();
+ List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
+ int upto = 0;
+ while (ts.incrementToken()) {
+ String token = termAtt.toString();
+ int startOffset = offsetAtt.startOffset();
+ int endOffset = offsetAtt.endOffset();
+ if (upto < startOffset) {
+ fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
+ upto = startOffset;
+ } else if (upto > startOffset) {
+ continue;
+ }
+
+ if (matchedTokens.contains(token)) {
+ // Token matches.
+ fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
+ upto = endOffset;
+ } else if (prefixToken != null && token.startsWith(prefixToken)) {
+ fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
+ if (prefixToken.length() < token.length()) {
+ fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
+ }
+ upto = endOffset;
+ }
+ }
+ ts.end();
+ int endOffset = offsetAtt.endOffset();
+ if (upto < endOffset) {
+ fragments.add(new LookupHighlightFragment(text.substring(upto), false));
+ }
+ ts.close();
+
+ return fragments;
+ }
+ };
+ suggester.build(new TermFreqPayloadArrayIterator(keys));
+
+ List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("ear", random()), 10, true, true);
+ assertEquals(1, results.size());
+ assertEquals("a penny saved is a penny <b>ear</b>ned", toString((List<LookupHighlightFragment>) results.get(0).highlightKey));
+ assertEquals(10, results.get(0).value);
+ assertEquals(new BytesRef("foobaz"), results.get(0).payload);
+ suggester.close();
+ }
+
+ public String toString(List<LookupHighlightFragment> fragments) {
+ StringBuilder sb = new StringBuilder();
+ for(LookupHighlightFragment fragment : fragments) {
+ if (fragment.isHit) {
+ sb.append("<b>");
+ }
+ sb.append(fragment.text);
+ if (fragment.isHit) {
+ sb.append("</b>");
+ }
+ }
+
+ return sb.toString();
+ }
+
public void testRandomMinPrefixLength() throws Exception {
TermFreqPayload keys[] = new TermFreqPayload[] {
new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")),
@@ -240,24 +349,17 @@ public class AnalyzingInfixSuggesterTest
suggester.build(new TermFreqPayloadArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
assertEquals(1, results.size());
- assertEquals("a <b>Penny</b> saved is a <b>penn</b>y earned", results.get(0).key);
+ assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key);
suggester.close();
- // Try again, but overriding addPrefixMatch to normalize case:
+ // Try again, but overriding addPrefixMatch to highlight
+ // the entire hit:
suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
@Override
protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
- prefixToken = prefixToken.toLowerCase(Locale.ROOT);
- String surfaceLower = surface.toLowerCase(Locale.ROOT);
sb.append("<b>");
- if (surfaceLower.startsWith(prefixToken)) {
- sb.append(surface.substring(0, prefixToken.length()));
- sb.append("</b>");
- sb.append(surface.substring(prefixToken.length()));
- } else {
- sb.append(surface);
- sb.append("</b>");
- }
+ sb.append(surface);
+ sb.append("</b>");
}
@Override
@@ -268,7 +370,7 @@ public class AnalyzingInfixSuggesterTest
suggester.build(new TermFreqPayloadArrayIterator(keys));
results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
assertEquals(1, results.size());
- assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key);
+ assertEquals("a <b>Penny</b> saved is a <b>penny</b> earned", results.get(0).key);
suggester.close();
}