You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/09/12 19:12:51 UTC
svn commit: r1522666 - in /lucene/dev/branches/branch_4x/lucene: ./ suggest/src/java/org/apache/lucene/search/suggest/ suggest/src/java/org/apache/lucene/search/suggest/analyzing/ suggest/src/test/org/apache/lucene/search/suggest/analyzing/

Author: mikemccand
Date: Thu Sep 12 17:12:51 2013
New Revision: 1522666

URL: http://svn.apache.org/r1522666
Log:
LUCENE-5133: allow highlight to Object for AnalyzingInfixSuggester for advanced use cases

Modified:
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1522666&r1=1522665&r2=1522666&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Sep 12 17:12:51 2013
@@ -11,6 +11,11 @@ New Features
   for advanced use cases where String is too restrictive (Luca
   Cavanna, Robert Muir, Mike McCandless)
 
+* LUCENE-5133: Changed AnalyzingInfixSuggester.highlight to return
+  Object instead of String, to allow for advanced use cases where
+  String is too restrictive (Robert Muir, Shai Erera, Mike
+  McCandless)
+
 Changes in backwards compatibility policy
 
 * LUCENE-5204: Directory doesn't have default implementations for

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java?rev=1522666&r1=1522665&r2=1522666&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java Thu Sep 12 17:12:51 2013
@@ -41,6 +41,10 @@ public abstract class Lookup {
     /** the key's text */
     public final CharSequence key;
 
+    /** Expert: custom Object to hold the result of a
+     *  highlighted suggestion. */
+    public final Object highlightKey;
+
     /** the key's weight */
     public final long value;
 
@@ -59,6 +63,17 @@ public abstract class Lookup {
      */
     public LookupResult(CharSequence key, long value, BytesRef payload) {
       this.key = key;
+      this.highlightKey = null;
+      this.value = value;
+      this.payload = payload;
+    }
+
+    /**
+     * Create a new result from a key+highlightKey+weight+payload triple.
+     */
+    public LookupResult(CharSequence key, Object highlightKey, long value, BytesRef payload) {
+      this.key = key;
+      this.highlightKey = highlightKey;
       this.value = value;
       this.payload = payload;
     }

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java?rev=1522666&r1=1522665&r2=1522666&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java Thu Sep 12 17:12:51 2013
@@ -67,6 +67,7 @@ import org.apache.lucene.search.TermQuer
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.spell.TermFreqIterator;
 import org.apache.lucene.search.spell.TermFreqPayloadIterator;
+import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
 import org.apache.lucene.search.suggest.Lookup;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
@@ -98,8 +99,10 @@ public class AnalyzingInfixSuggester ext
   /** Field name used for the indexed text. */
   protected final static String TEXT_FIELD_NAME = "text";
 
-  private final Analyzer queryAnalyzer;
-  final Analyzer indexAnalyzer;
+  /** Analyzer used at search time */
+  protected final Analyzer queryAnalyzer;
+  /** Analyzer used at index time */
+  protected final Analyzer indexAnalyzer;
   final Version matchVersion;
   private final File indexPath;
   final int minPrefixChars;
@@ -422,9 +425,6 @@ public class AnalyzingInfixSuggester ext
         ScoreDoc sd = hits.scoreDocs[i];
         textDV.get(sd.doc, scratch);
         String text = scratch.utf8ToString();
-        if (doHighlight) {
-          text = highlight(text, matchedTokens, prefixToken);
-        }
         long score = weightsDV.get(sd.doc);
 
         BytesRef payload;
@@ -435,7 +435,15 @@ public class AnalyzingInfixSuggester ext
           payload = null;
         }
 
-        results.add(new LookupResult(text, score, payload));
+        LookupResult result;
+
+        if (doHighlight) {
+          Object highlightKey = highlight(text, matchedTokens, prefixToken);
+          result = new LookupResult(highlightKey.toString(), highlightKey, score, payload);
+        } else {
+          result = new LookupResult(text, score, payload);
+        }
+        results.add(result);
       }
       //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
       //System.out.println(results);
@@ -451,7 +459,11 @@ public class AnalyzingInfixSuggester ext
     return in;
   }
 
-  private String highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
+  /** Override this method to customize the Object
+   *  representing a single highlighted suggestions; the
+   *  result is set on each {@link
+   *  LookupResult#highlightKey} member. */
+  protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
     TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
     CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
     OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
@@ -463,7 +475,7 @@ public class AnalyzingInfixSuggester ext
       int startOffset = offsetAtt.startOffset();
       int endOffset = offsetAtt.endOffset();
       if (upto < startOffset) {
-        sb.append(text.substring(upto, startOffset));
+        addNonMatch(sb, text.substring(upto, startOffset));
         upto = startOffset;
       } else if (upto > startOffset) {
         continue;
@@ -481,24 +493,38 @@ public class AnalyzingInfixSuggester ext
     ts.end();
     int endOffset = offsetAtt.endOffset();
     if (upto < endOffset) {
-      sb.append(text.substring(upto));
+      addNonMatch(sb, text.substring(upto));
     }
     ts.close();
 
     return sb.toString();
   }
 
-  /** Appends the whole matched token to the provided {@code
-   *  StringBuilder}. */
+  /** Called while highlighting a single result, to append a
+   *  non-matching chunk of text from the suggestion to the
+   *  provided fragments list.
+   *  @param sb The {@code StringBuilder} to append to
+   *  @param text The text chunk to add
+   */
+  protected void addNonMatch(StringBuilder sb, String text) {
+    sb.append(text);
+  }
+
+  /** Called while highlighting a single result, to append
+   *  the whole matched token to the provided fragments list.
+   *  @param sb The {@code StringBuilder} to append to
+   *  @param surface The surface form (original) text
+   *  @param analyzed The analyzed token corresponding to the surface form text
+   */
   protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
     sb.append("<b>");
     sb.append(surface);
     sb.append("</b>");
   }
 
-  /** Append a matched prefix token, to the provided
-   *  {@code StringBuilder}. 
-   *  @param sb {@code StringBuilder} to append to
+  /** Called while highlighting a single result, to append a
+   *  matched prefix token, to the provided fragments list.
+   *  @param sb The {@code StringBuilder} to append to
    *  @param surface The fragment of the surface form
    *        (indexed during {@link #build}, corresponding to
    *        this match
@@ -509,13 +535,10 @@ public class AnalyzingInfixSuggester ext
     // TODO: apps can try to invert their analysis logic
     // here, e.g. downcase the two before checking prefix:
     sb.append("<b>");
-    if (surface.startsWith(prefixToken)) {
-      sb.append(surface.substring(0, prefixToken.length()));
-      sb.append("</b>");
+    sb.append(surface.substring(0, prefixToken.length()));
+    sb.append("</b>");
+    if (prefixToken.length() < surface.length()) {
       sb.append(surface.substring(prefixToken.length()));
-    } else {
-      sb.append(surface);
-      sb.append("</b>");
     }
   }
 

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java?rev=1522666&r1=1522665&r2=1522666&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java Thu Sep 12 17:12:51 2013
@@ -18,14 +18,20 @@ package org.apache.lucene.search.suggest
  */
 
 import java.io.File;
+import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.List;
-import java.util.Locale;
+import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreqPayload;
@@ -120,6 +126,109 @@ public class AnalyzingInfixSuggesterTest
     suggester.close();
   }
 
+  /** Used to return highlighted result; see {@link
+   *  LookupResult#highlightKey} */
+  private static final class LookupHighlightFragment {
+    /** Portion of text for this fragment. */
+    public final String text;
+
+    /** True if this text matched a part of the user's
+     *  query. */
+    public final boolean isHit;
+
+    /** Sole constructor. */
+    public LookupHighlightFragment(String text, boolean isHit) {
+      this.text = text;
+      this.isHit = isHit;
+    }
+
+    @Override
+    public String toString() {
+      return "LookupHighlightFragment(text=" + text + " isHit=" + isHit + ")";
+    }
+  }
+
+  @SuppressWarnings("unchecked")
+  public void testHighlightAsObject() throws Exception {
+    TermFreqPayload keys[] = new TermFreqPayload[] {
+      new TermFreqPayload("a penny saved is a penny earned", 10, new BytesRef("foobaz")),
+    };
+
+    File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
+
+    Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
+        @Override
+        protected Directory getDirectory(File path) {
+          return newDirectory();
+        }
+
+        @Override
+        protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
+          TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
+          CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+          OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
+          ts.reset();
+          List<LookupHighlightFragment> fragments = new ArrayList<LookupHighlightFragment>();
+          int upto = 0;
+          while (ts.incrementToken()) {
+            String token = termAtt.toString();
+            int startOffset = offsetAtt.startOffset();
+            int endOffset = offsetAtt.endOffset();
+            if (upto < startOffset) {
+              fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
+              upto = startOffset;
+            } else if (upto > startOffset) {
+              continue;
+            }
+
+            if (matchedTokens.contains(token)) {
+              // Token matches.
+              fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
+              upto = endOffset;
+            } else if (prefixToken != null && token.startsWith(prefixToken)) {
+              fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset+prefixToken.length()), true));
+              if (prefixToken.length() < token.length()) {
+                fragments.add(new LookupHighlightFragment(text.substring(startOffset+prefixToken.length(), startOffset+token.length()), false));
+              }
+              upto = endOffset;
+            }
+          }
+          ts.end();
+          int endOffset = offsetAtt.endOffset();
+          if (upto < endOffset) {
+            fragments.add(new LookupHighlightFragment(text.substring(upto), false));
+          }
+          ts.close();
+
+          return fragments;
+        }
+      };
+    suggester.build(new TermFreqPayloadArrayIterator(keys));
+
+    List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("ear", random()), 10, true, true);
+    assertEquals(1, results.size());
+    assertEquals("a penny saved is a penny <b>ear</b>ned", toString((List<LookupHighlightFragment>) results.get(0).highlightKey));
+    assertEquals(10, results.get(0).value);
+    assertEquals(new BytesRef("foobaz"), results.get(0).payload);
+    suggester.close();
+  }
+
+  public String toString(List<LookupHighlightFragment> fragments) {
+    StringBuilder sb = new StringBuilder();
+    for(LookupHighlightFragment fragment : fragments) {
+      if (fragment.isHit) {
+        sb.append("<b>");
+      }
+      sb.append(fragment.text);
+      if (fragment.isHit) {
+        sb.append("</b>");
+      }
+    }
+
+    return sb.toString();
+  }
+
   public void testRandomMinPrefixLength() throws Exception {
     TermFreqPayload keys[] = new TermFreqPayload[] {
       new TermFreqPayload("lend me your ear", 8, new BytesRef("foobar")),
@@ -240,24 +349,17 @@ public class AnalyzingInfixSuggesterTest
     suggester.build(new TermFreqPayloadArrayIterator(keys));
     List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
     assertEquals(1, results.size());
-    assertEquals("a <b>Penny</b> saved is a <b>penn</b>y earned", results.get(0).key);
+    assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key);
     suggester.close();
 
-    // Try again, but overriding addPrefixMatch to normalize case:
+    // Try again, but overriding addPrefixMatch to highlight
+    // the entire hit:
     suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
         @Override
         protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
-          prefixToken = prefixToken.toLowerCase(Locale.ROOT);
-          String surfaceLower = surface.toLowerCase(Locale.ROOT);
           sb.append("<b>");
-          if (surfaceLower.startsWith(prefixToken)) {
-            sb.append(surface.substring(0, prefixToken.length()));
-            sb.append("</b>");
-            sb.append(surface.substring(prefixToken.length()));
-          } else {
-            sb.append(surface);
-            sb.append("</b>");
-          }
+          sb.append(surface);
+          sb.append("</b>");
         }
 
         @Override
@@ -268,7 +370,7 @@ public class AnalyzingInfixSuggesterTest
     suggester.build(new TermFreqPayloadArrayIterator(keys));
     results = suggester.lookup(_TestUtil.stringToCharSequence("penn", random()), 10, true, true);
     assertEquals(1, results.size());
-    assertEquals("a <b>Penn</b>y saved is a <b>penn</b>y earned", results.get(0).key);
+    assertEquals("a <b>Penny</b> saved is a <b>penny</b> earned", results.get(0).key);
     suggester.close();
   }