You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/07/02 16:08:26 UTC

svn commit: r1498945 - in /lucene/dev/trunk: lucene/ lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/ lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/ solr/core/src/java/org/apache/solr/highlight/

Author: rmuir
Date: Tue Jul  2 14:08:26 2013
New Revision: 1498945

URL: http://svn.apache.org/r1498945
Log:
LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
    lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1498945&r1=1498944&r2=1498945&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Jul  2 14:08:26 2013
@@ -268,6 +268,11 @@ New Features
 * SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter 
   to handle "nynorsk" (Erlend Garåsen, janhoy via Robert Muir)
 
+* LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter, for cases
+  where you want a different logical separator between field values. This can
+  be set to e.g. U+2029 PARAGRAPH SEPARATOR if you never want passes to span
+  values. (Mike McCandless, Robert Muir)
+
 API Changes
 
 * LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes

Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java?rev=1498945&r1=1498944&r2=1498945&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java Tue Jul  2 14:08:26 2013
@@ -369,7 +369,11 @@ public class PostingsHighlighter {
    *  identical to what was indexed. */
   protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
     String contents[][] = new String[fields.length][docids.length];
-    LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
+    char valueSeparators[] = new char[fields.length];
+    for (int i = 0; i < fields.length; i++) {
+      valueSeparators[i] = getMultiValuedSeparator(fields[i]);
+    }
+    LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
     for (int i = 0; i < docids.length; i++) {
       searcher.doc(docids[i], visitor);
       for (int j = 0; j < fields.length; j++) {
@@ -379,6 +383,16 @@ public class PostingsHighlighter {
     }
     return contents;
   }
+  
+  /** 
+   * Returns the logical separator between values for multi-valued fields.
+   * The default value is a space character, which means passages can span across values,
+   * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
+   * if each value holds a discrete passage for highlighting.
+   */
+  protected char getMultiValuedSeparator(String field) {
+    return ' ';
+  }
     
   private Map<Integer,String> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<AtomicReaderContext> leaves, int maxPassages) throws IOException {  
     Map<Integer,String> highlights = new HashMap<Integer,String>();
@@ -652,12 +666,15 @@ public class PostingsHighlighter {
   
   private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
     private final String fields[];
+    private final char valueSeparators[];
     private final int maxLength;
     private final StringBuilder builders[];
     private int currentField = -1;
     
-    public LimitedStoredFieldVisitor(String fields[], int maxLength) {
+    public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
+      assert fields.length == valueSeparators.length;
       this.fields = fields;
+      this.valueSeparators = valueSeparators;
       this.maxLength = maxLength;
       builders = new StringBuilder[fields.length];
       for (int i = 0; i < builders.length; i++) {
@@ -670,7 +687,7 @@ public class PostingsHighlighter {
       assert currentField >= 0;
       StringBuilder builder = builders[currentField];
       if (builder.length() > 0 && builder.length() < maxLength) {
-        builder.append(' '); // for the offset gap, TODO: make this configurable
+        builder.append(valueSeparators[currentField]);
       }
       if (builder.length() + value.length() > maxLength) {
         builder.append(value, 0, maxLength - builder.length());

Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java?rev=1498945&r1=1498944&r2=1498945&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java Tue Jul  2 14:08:26 2013
@@ -922,4 +922,48 @@ public class TestPostingsHighlighter ext
     ir.close();
     dir.close();
   }
+  
+  /** customizing the gap separator to force a sentence break */
+  public void testGapSeparator() throws Exception {
+    Directory dir = newDirectory();
+    // use simpleanalyzer for more natural tokenization (else "test." is a token)
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Document doc = new Document();
+    
+    Field body1 = new Field("body", "", offsetsType);
+    body1.setStringValue("This is a multivalued field");
+    doc.add(body1);
+    
+    Field body2 = new Field("body", "", offsetsType);
+    body2.setStringValue("This is something different");
+    doc.add(body2);
+    
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter() {
+      @Override
+      protected char getMultiValuedSeparator(String field) {
+        assert field.equals("body");
+        return '\u2029';
+      }
+    };
+    Query query = new TermQuery(new Term("body", "field"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+    assertEquals(1, snippets.length);
+    assertEquals("This is a multivalued <b>field</b>\u2029", snippets[0]);
+    
+    ir.close();
+    dir.close();
+  }
 }

Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java?rev=1498945&r1=1498944&r2=1498945&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java Tue Jul  2 14:08:26 2013
@@ -67,6 +67,7 @@ import org.apache.solr.util.plugin.Plugi
  *       &lt;str name="hl.bs.variant"&gt;&lt;/str&gt;
  *       &lt;str name="hl.bs.type"&gt;SENTENCE&lt;/str&gt;
  *       &lt;int name="hl.maxAnalyzedChars"&gt;10000&lt;/int&gt;
+ *       &lt;str name="hl.multiValuedSeparatorChar"&gt; &lt;/str&gt;
  *     &lt;/lst&gt;
  *   &lt;/requestHandler&gt;
  * </pre>
@@ -96,6 +97,7 @@ import org.apache.solr.util.plugin.Plugi
  *    <li>hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale)
  *    <li>hl.bs.variant (string) specifies country code for BreakIterator. default is empty string (root locale)
  *    <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
+ *    <li>hl.multiValuedSeparatorChar specifies the logical separator between values for multi-valued fields.
  *        NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
  *  </ul>
  *  
@@ -167,6 +169,15 @@ public class PostingsSolrHighlighter ext
           String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
           return parseBreakIterator(type, locale);
         }
+
+        @Override
+        protected char getMultiValuedSeparator(String field) {
+          String sep = params.getFieldParam(field, HighlightParams.MULTI_VALUED_SEPARATOR, " ");
+          if (sep.length() != 1) {
+            throw new IllegalArgumentException(HighlightParams.MULTI_VALUED_SEPARATOR + " must be exactly one character.");
+          }
+          return sep.charAt(0);
+        }
       };
       
       Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);