You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/04/06 16:27:45 UTC

svn commit: r1465251 - in /lucene/dev/branches/branch_4x: ./ solr/ solr/core/ solr/core/src/java/org/apache/solr/highlight/ solr/core/src/test-files/solr/collection1/conf/ solr/core/src/test/org/apache/solr/highlight/ solr/solrj/ solr/solrj/src/java/or...

Author: rmuir
Date: Sat Apr  6 14:27:44 2013
New Revision: 1465251

URL: http://svn.apache.org/r1465251
Log:
SOLR-4683: add BreakIterator config to PostingsSolrHighlighter

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
    lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-postingshighlight.xml
    lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
    lucene/dev/branches/branch_4x/solr/solrj/   (props changed)
    lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java?rev=1465251&r1=1465250&r2=1465251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java Sat Apr  6 14:27:44 2013
@@ -20,6 +20,7 @@ package org.apache.solr.highlight;
 import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.Collections;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 
@@ -29,6 +30,7 @@ import org.apache.lucene.search.postings
 import org.apache.lucene.search.postingshighlight.PassageFormatter;
 import org.apache.lucene.search.postingshighlight.PassageScorer;
 import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
+import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
 import org.apache.solr.common.params.HighlightParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
@@ -58,6 +60,10 @@ import org.apache.solr.util.plugin.Plugi
  *       <float name="hl.score.k1">1.2</float>
  *       <float name="hl.score.b">0.75</float>
  *       <float name="hl.score.pivot">87</float>
+ *       <str name="hl.bs.language"></str>
+ *       <str name="hl.bs.country"></str>
+ *       <str name="hl.bs.variant"></str>
+ *       <str name="hl.bs.type">SENTENCE</str>
  *       <int name="hl.maxAnalyzedChars">10000</int>
  *     </lst>
  *   </requestHandler>
@@ -74,7 +80,7 @@ import org.apache.solr.util.plugin.Plugi
  *    <li>fields to highlight must be configured with storeOffsetsWithPositions="true"
  *    <li>hl.q (string) can specify the query
  *    <li>hl.fl (string) specifies the field list.
- *    <li>hl.snippets (int) specifies how many underlying sentence fragments form the resulting snippet.
+ *    <li>hl.snippets (int) specifies how many underlying passages form the resulting snippet.
  *    <li>hl.tag.pre (string) specifies text which appears before a highlighted term.
  *    <li>hl.tag.post (string) specifies text which appears after a highlighted term.
  *    <li>hl.tag.ellipsis (string) specifies text which joins non-adjacent passages.
@@ -82,6 +88,10 @@ import org.apache.solr.util.plugin.Plugi
  *    <li>hl.score.k1 (float) specifies bm25 scoring parameter 'k1'
  *    <li>hl.score.b (float) specifies bm25 scoring parameter 'b'
  *    <li>hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl'
+ *    <li>hl.bs.type (string) specifies how to divide text into passages: [SENTENCE, LINE, WORD, CHARACTER, WHOLE]
+ *    <li>hl.bs.language (string) specifies language code for BreakIterator. default is empty string (root locale)
+ *    <li>hl.bs.country (string) specifies country code for BreakIterator. default is empty string (root locale)
+ *    <li>hl.bs.variant (string) specifies variant code for BreakIterator. default is empty string (root locale)
  *    <li>hl.maxAnalyzedChars specifies how many characters at most will be processed in a document.
  *        NOTE: currently hl.maxAnalyzedChars cannot yet be specified per-field
  *  </ul>
@@ -143,6 +153,16 @@ public class PostingsSolrHighlighter ext
           float pivot = params.getFieldFloat(fieldName, HighlightParams.SCORE_PIVOT, 87f);
           return new PassageScorer(k1, b, pivot);
         }
+
+        @Override // selects the passage BreakIterator per field from the hl.bs.* parameters
+        protected BreakIterator getBreakIterator(String field) {
+          Locale bsLocale = parseLocale(
+              params.getFieldParam(field, HighlightParams.BS_LANGUAGE),
+              params.getFieldParam(field, HighlightParams.BS_COUNTRY),
+              params.getFieldParam(field, HighlightParams.BS_VARIANT));
+          String bsType = params.getFieldParam(field, HighlightParams.BS_TYPE);
+          return parseBreakIterator(bsType, bsLocale);
+        }
       };
       
       Map<String,String[]> snippets = highlighter.highlightFields(fieldNames, query, searcher, docIDs, maxPassages);
@@ -212,4 +232,36 @@ public class PostingsSolrHighlighter ext
       return new String[docIDs.length];
     }
   }
+  
+  /** Creates the break iterator selected by type for the locale; a null type means SENTENCE. */
+  protected BreakIterator parseBreakIterator(String type, Locale locale) {
+    final String kind = (type == null) ? "SENTENCE" : type;
+    if (kind.equals("WHOLE")) {
+      return new WholeBreakIterator();
+    } else if (kind.equals("CHARACTER")) {
+      return BreakIterator.getCharacterInstance(locale);
+    } else if (kind.equals("WORD")) {
+      return BreakIterator.getWordInstance(locale);
+    } else if (kind.equals("LINE")) {
+      return BreakIterator.getLineInstance(locale);
+    } else if (kind.equals("SENTENCE")) {
+      return BreakIterator.getSentenceInstance(locale);
+    }
+    throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
+  }
+  
+  /**
+   * Parses a locale from a language+country+variant spec. A null part means "unspecified" and is
+   * treated as "" (all three null gives Locale.ROOT); Locale's constructors reject null with NPE.
+   * @throws IllegalArgumentException if a variant is specified without a country
+   */
+  protected Locale parseLocale(String language, String country, String variant) {
+    if (language == null && country == null && variant == null) {
+      return Locale.ROOT;
+    } else if (variant != null && country == null) {
+      throw new IllegalArgumentException("To specify variant, country is required");
+    }
+    String lang = (language == null) ? "" : language;
+    return new Locale(lang, country == null ? "" : country, variant == null ? "" : variant);
+  }
 }

Modified: lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-postingshighlight.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-postingshighlight.xml?rev=1465251&r1=1465250&r2=1465251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-postingshighlight.xml (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test-files/solr/collection1/conf/schema-postingshighlight.xml Sat Apr  6 14:27:44 2013
@@ -26,6 +26,7 @@
     <fieldtype name="text" class="solr.TextField">
       <analyzer>
         <tokenizer class="solr.MockTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
       </analyzer>
     </fieldtype>
     
@@ -33,6 +34,7 @@
     <fieldtype name="text_offsets" class="solr.TextField" storeOffsetsWithPositions="true">
       <analyzer>
         <tokenizer class="solr.MockTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
       </analyzer>
     </fieldtype>
    </types>

Modified: lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java?rev=1465251&r1=1465250&r2=1465251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java Sat Apr  6 14:27:44 2013
@@ -42,7 +42,12 @@ public class TestPostingsSolrHighlighter
     assertTrue(schema.getField("text").storeOffsetsWithPositions());
     assertTrue(schema.getField("text3").storeOffsetsWithPositions());
     assertFalse(schema.getField("text2").storeOffsetsWithPositions());
-    
+  }
+  
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    clearIndex();
     assertU(adoc("text", "document one", "text2", "document one", "text3", "crappy document", "id", "101"));
     assertU(adoc("text", "second document", "text2", "second document", "text3", "crappier document", "id", "102"));
     assertU(commit());
@@ -126,4 +131,20 @@ public class TestPostingsSolrHighlighter
         "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second <em>document</em>'",
         "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text3']/str='crappier [document]'");
   }
+  
+  public void testBreakIterator() { // hl.bs.type=WORD: expects just the matched word as each snippet
+    assertQ("different breakiterator", 
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WORD"),
+        "count(//lst[@name='highlighting']/*)=2",
+        "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='<em>document</em>'",
+        "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='<em>document</em>'");
+  }
+  
+  public void testBreakIterator2() { // hl.bs.type=WHOLE: expects both sentences of doc 103 in one snippet
+    assertU(adoc("text", "Document one has a first sentence. Document two has a second sentence.", "id", "103"));
+    assertU(commit());
+    assertQ("different breakiterator", 
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "WHOLE"),
+        "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
+  }
 }

Modified: lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java?rev=1465251&r1=1465250&r2=1465251&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java (original)
+++ lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java Sat Apr  6 14:27:44 2013
@@ -41,6 +41,7 @@ public interface HighlightParams {
   public static final String BS_TYPE = HIGHLIGHT+".bs.type";
   public static final String BS_LANGUAGE = HIGHLIGHT+".bs.language";
   public static final String BS_COUNTRY = HIGHLIGHT+".bs.country";
+  public static final String BS_VARIANT = HIGHLIGHT+".bs.variant";
   public static final String FIELD_MATCH = HIGHLIGHT+".requireFieldMatch";
   public static final String DEFAULT_SUMMARY = HIGHLIGHT + ".defaultSummary";
   public static final String ALTERNATE_FIELD = HIGHLIGHT+".alternateField";