You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2017/03/02 02:04:15 UTC

lucene-solr:master: SOLR-10153: (and SOLR-10152): UH & PH: Add hl.bs.type=SEPARATOR with new param hl.bs.separator

Repository: lucene-solr
Updated Branches:
  refs/heads/master 8684fe794 -> d1d73bfbe


SOLR-10153: (and SOLR-10152): UH & PH: Add hl.bs.type=SEPARATOR with new param hl.bs.separator


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/d1d73bfb
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/d1d73bfb
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/d1d73bfb

Branch: refs/heads/master
Commit: d1d73bfbea3db4adead960fae3597bec7647fba6
Parents: 8684fe7
Author: David Smiley <ds...@apache.org>
Authored: Wed Mar 1 21:04:07 2017 -0500
Committer: David Smiley <ds...@apache.org>
Committed: Wed Mar 1 21:04:07 2017 -0500

----------------------------------------------------------------------
 solr/CHANGES.txt                                |  3 ++
 .../solr/highlight/PostingsSolrHighlighter.java | 35 ++++++++++++++++----
 .../solr/highlight/UnifiedSolrHighlighter.java  | 28 ++++++++++++----
 .../highlight/TestPostingsSolrHighlighter.java  | 13 ++++++++
 .../highlight/TestUnifiedSolrHighlighter.java   | 13 ++++++++
 .../solr/common/params/HighlightParams.java     |  1 +
 6 files changed, 80 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index db5e3e6..fa02c39 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -137,6 +137,9 @@ New Features
 * SOLR-10158: Add support for "preload" option in MMapDirectoryFactory.
   (Amrit Sarkar via Uwe Schindler)
 
+* SOLR-10153 & SOLR-10152: The Unified and Postings based highlighters: Add hl.bs.type=SEPARATOR along with new param
+  hl.bs.separator to break passages by a provided single character. (Amrit Sarkar, David Smiley)
+
 Bug Fixes
 ----------------------
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
index 9fcf9f3..5ea3db1 100644
--- a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
@@ -26,12 +26,14 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
 import org.apache.lucene.search.postingshighlight.DefaultPassageFormatter;
 import org.apache.lucene.search.postingshighlight.Passage;
 import org.apache.lucene.search.postingshighlight.PassageFormatter;
 import org.apache.lucene.search.postingshighlight.PassageScorer;
 import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
 import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.HighlightParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
@@ -239,12 +241,33 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
 
     @Override
     protected BreakIterator getBreakIterator(String field) {
-      String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
-      String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
-      String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
-      Locale locale = parseLocale(language, country, variant);
       String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
-      return parseBreakIterator(type, locale);
+      if ("WHOLE".equals(type)) {
+        return new WholeBreakIterator();
+      } else if ("SEPARATOR".equals(type)) {
+        char customSep = parseBiSepChar(params.getFieldParam(field, HighlightParams.BS_SEP));
+        return new CustomSeparatorBreakIterator(customSep);
+      } else {
+        String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
+        String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
+        String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
+        Locale locale = parseLocale(language, country, variant);
+        return parseBreakIterator(type, locale);
+      }
+    }
+
+    /**
+     * parse custom separator char for {@link CustomSeparatorBreakIterator}
+     */
+    protected char parseBiSepChar(String sepChar) {
+      if (sepChar == null) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP + " not passed");
+      }
+      if (sepChar.length() != 1) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP +
+            " must be a single char but got: '" + sepChar + "'");
+      }
+      return sepChar.charAt(0);
     }
 
     @Override
@@ -276,8 +299,6 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
       return BreakIterator.getWordInstance(locale);
     } else if ("CHARACTER".equals(type)) {
       return BreakIterator.getCharacterInstance(locale);
-    } else if ("WHOLE".equals(type)) {
-      return new WholeBreakIterator();
     } else {
       throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
index c80e522..e9c842c 100644
--- a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
@@ -28,12 +28,14 @@ import java.util.function.Predicate;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
 import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
 import org.apache.lucene.search.uhighlight.DefaultPassageFormatter;
 import org.apache.lucene.search.uhighlight.LengthGoalBreakIterator;
 import org.apache.lucene.search.uhighlight.PassageFormatter;
 import org.apache.lucene.search.uhighlight.PassageScorer;
 import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
+import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.HighlightParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
@@ -298,18 +300,20 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
       // Use a default fragsize the same as the regex Fragmenter (original Highlighter) since we're
       //  both likely shooting for sentence-like patterns.
       int fragsize = params.getFieldInt(field, HighlightParams.FRAGSIZE, LuceneRegexFragmenter.DEFAULT_FRAGMENT_SIZE);
-      if (fragsize == 0) { // special value; no fragmenting
+      String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
+      if (fragsize == 0 || "WHOLE".equals(type)) { // 0 is special value; no fragmenting
         return new WholeBreakIterator();
+      } else if ("SEPARATOR".equals(type)) {
+        char customSep = parseBiSepChar(params.getFieldParam(field, HighlightParams.BS_SEP));
+        return new CustomSeparatorBreakIterator(customSep);
       }
-
       String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
       String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
       String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
       Locale locale = parseLocale(language, country, variant);
-      String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
       BreakIterator baseBI = parseBreakIterator(type, locale);
 
-      if (fragsize <= 1 || baseBI instanceof WholeBreakIterator) { // no real minimum size
+      if (fragsize <= 1) { // no real minimum size
         return baseBI;
       }
       return LengthGoalBreakIterator.createMinLength(baseBI, fragsize);
@@ -317,6 +321,20 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
     }
 
     /**
+     * parse custom separator char for {@link CustomSeparatorBreakIterator}
+     */
+    protected char parseBiSepChar(String sepChar) {
+      if (sepChar == null) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP + " not passed");
+      }
+      if (sepChar.length() != 1) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP +
+            " must be a single char but got: '" + sepChar + "'");
+      }
+      return sepChar.charAt(0);
+    }
+
+    /**
      * parse a break iterator type for the specified locale
      */
     protected BreakIterator parseBreakIterator(String type, Locale locale) {
@@ -328,8 +346,6 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
         return BreakIterator.getWordInstance(locale);
       } else if ("CHARACTER".equals(type)) {
         return BreakIterator.getCharacterInstance(locale);
-      } else if ("WHOLE".equals(type)) {
-        return new WholeBreakIterator();
       } else {
         throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
       }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
index 074f9f4..3f25464 100644
--- a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
+++ b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
@@ -145,6 +145,19 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
         "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
   }
   
+  public void testBreakIterator3() {
+    assertU(adoc("text", "This document contains # special characters, while the other document contains the same # special character.", "id", "103"));
+    assertU(adoc("text", "While the other document contains the same # special character.", "id", "104"));
+    assertU(commit());
+    assertQ("different breakiterator", 
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#"),
+        "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
+    assertQ("different breakiterator", 
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#"),
+        "//lst[@name='highlighting']/lst[@name='104']/arr[@name='text']/str='While the other <em>document</em> contains the same #'");
+
+  }
+  
   public void testEncoder() {
     assertU(adoc("text", "Document one has a first <i>sentence</i>.", "id", "103"));
     assertU(commit());

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
index d452829..9835518 100644
--- a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
+++ b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
@@ -221,6 +221,19 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
         req("q", "text:document", "sort", "id asc", "hl", "true", "hl.fragsize", "0"),
         "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
   }
+  
+  public void testBreakIteratorCustom() {
+    assertU(adoc("text", "This document contains # special characters, while the other document contains the same # special character.", "id", "103"));
+    assertU(adoc("text", "While the other document contains the same # special character.", "id", "104"));
+    assertU(commit());
+    assertQ("CUSTOM breakiterator", 
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
+        "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
+    assertQ("different breakiterator", 
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
+        "//lst[@name='highlighting']/lst[@name='104']/arr[@name='text']/str='While the other <em>document</em> contains the same #'");
+
+  }
 
   public void testFragsize() {
     // test default is 70... so make a sentence that is a little less (closer to 70 than end of text)

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
index 57f6734..3741214 100644
--- a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
+++ b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
@@ -63,6 +63,7 @@ public interface HighlightParams {
   public static final String BS_LANGUAGE = HIGHLIGHT+".bs.language"; // FVH, UH, PH
   public static final String BS_COUNTRY  = HIGHLIGHT+".bs.country"; // FVH, UH, PH
   public static final String BS_VARIANT  = HIGHLIGHT+".bs.variant"; // FVH, UH, PH
+  public static final String BS_SEP      = HIGHLIGHT+".bs.separator"; // UH, PH
 
   // formatting
   public static final String FORMATTER   = HIGHLIGHT+".formatter"; // OH