You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2017/03/02 02:04:15 UTC
lucene-solr:master: SOLR-10153: (and SOLR-10152): UH & PH: Add
hl.bs.type=SEPARATOR with new param hl.bs.separator
Repository: lucene-solr
Updated Branches:
refs/heads/master 8684fe794 -> d1d73bfbe
SOLR-10153: (and SOLR-10152): UH & PH: Add hl.bs.type=SEPARATOR with new param hl.bs.separator
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/d1d73bfb
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/d1d73bfb
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/d1d73bfb
Branch: refs/heads/master
Commit: d1d73bfbea3db4adead960fae3597bec7647fba6
Parents: 8684fe7
Author: David Smiley <ds...@apache.org>
Authored: Wed Mar 1 21:04:07 2017 -0500
Committer: David Smiley <ds...@apache.org>
Committed: Wed Mar 1 21:04:07 2017 -0500
----------------------------------------------------------------------
solr/CHANGES.txt | 3 ++
.../solr/highlight/PostingsSolrHighlighter.java | 35 ++++++++++++++++----
.../solr/highlight/UnifiedSolrHighlighter.java | 28 ++++++++++++----
.../highlight/TestPostingsSolrHighlighter.java | 13 ++++++++
.../highlight/TestUnifiedSolrHighlighter.java | 13 ++++++++
.../solr/common/params/HighlightParams.java | 1 +
6 files changed, 80 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index db5e3e6..fa02c39 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -137,6 +137,9 @@ New Features
* SOLR-10158: Add support for "preload" option in MMapDirectoryFactory.
(Amrit Sarkar via Uwe Schindler)
+* SOLR-10153 & SOLR-10152: The Unified and Postings based highlighters: Add hl.bs.type=SEPARATOR along with new param
+ hl.bs.separator to break passages by a provided single character. (Amrit Sarkar, David Smiley)
+
Bug Fixes
----------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
index 9fcf9f3..5ea3db1 100644
--- a/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/PostingsSolrHighlighter.java
@@ -26,12 +26,14 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.postingshighlight.DefaultPassageFormatter;
import org.apache.lucene.search.postingshighlight.Passage;
import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PassageScorer;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
+import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
@@ -239,12 +241,33 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
@Override
protected BreakIterator getBreakIterator(String field) {
- String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
- String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
- String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
- Locale locale = parseLocale(language, country, variant);
String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
- return parseBreakIterator(type, locale);
+ if ("WHOLE".equals(type)) {
+ return new WholeBreakIterator();
+ } else if ("SEPARATOR".equals(type)) {
+ char customSep = parseBiSepChar(params.getFieldParam(field, HighlightParams.BS_SEP));
+ return new CustomSeparatorBreakIterator(customSep);
+ } else {
+ String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
+ String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
+ String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
+ Locale locale = parseLocale(language, country, variant);
+ return parseBreakIterator(type, locale);
+ }
+ }
+
+ /**
+ * parse custom separator char for {@link CustomSeparatorBreakIterator}
+ */
+ protected char parseBiSepChar(String sepChar) {
+ if (sepChar == null) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP + " not passed");
+ }
+ if (sepChar.length() != 1) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP +
+ " must be a single char but got: '" + sepChar + "'");
+ }
+ return sepChar.charAt(0);
}
@Override
@@ -276,8 +299,6 @@ public class PostingsSolrHighlighter extends SolrHighlighter implements PluginIn
return BreakIterator.getWordInstance(locale);
} else if ("CHARACTER".equals(type)) {
return BreakIterator.getCharacterInstance(locale);
- } else if ("WHOLE".equals(type)) {
- return new WholeBreakIterator();
} else {
throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
index c80e522..e9c842c 100644
--- a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
@@ -28,12 +28,14 @@ import java.util.function.Predicate;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.postingshighlight.WholeBreakIterator;
import org.apache.lucene.search.uhighlight.DefaultPassageFormatter;
import org.apache.lucene.search.uhighlight.LengthGoalBreakIterator;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
+import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
@@ -298,18 +300,20 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
// Use a default fragsize the same as the regex Fragmenter (original Highlighter) since we're
// both likely shooting for sentence-like patterns.
int fragsize = params.getFieldInt(field, HighlightParams.FRAGSIZE, LuceneRegexFragmenter.DEFAULT_FRAGMENT_SIZE);
- if (fragsize == 0) { // special value; no fragmenting
+ String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
+ if (fragsize == 0 || "WHOLE".equals(type)) { // 0 is special value; no fragmenting
return new WholeBreakIterator();
+ } else if ("SEPARATOR".equals(type)) {
+ char customSep = parseBiSepChar(params.getFieldParam(field, HighlightParams.BS_SEP));
+ return new CustomSeparatorBreakIterator(customSep);
}
-
String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
Locale locale = parseLocale(language, country, variant);
- String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
BreakIterator baseBI = parseBreakIterator(type, locale);
- if (fragsize <= 1 || baseBI instanceof WholeBreakIterator) { // no real minimum size
+ if (fragsize <= 1) { // no real minimum size
return baseBI;
}
return LengthGoalBreakIterator.createMinLength(baseBI, fragsize);
@@ -317,6 +321,20 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
}
/**
+ * parse custom separator char for {@link CustomSeparatorBreakIterator}
+ */
+ protected char parseBiSepChar(String sepChar) {
+ if (sepChar == null) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP + " not passed");
+ }
+ if (sepChar.length() != 1) {
+ throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP +
+ " must be a single char but got: '" + sepChar + "'");
+ }
+ return sepChar.charAt(0);
+ }
+
+ /**
* parse a break iterator type for the specified locale
*/
protected BreakIterator parseBreakIterator(String type, Locale locale) {
@@ -328,8 +346,6 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
return BreakIterator.getWordInstance(locale);
} else if ("CHARACTER".equals(type)) {
return BreakIterator.getCharacterInstance(locale);
- } else if ("WHOLE".equals(type)) {
- return new WholeBreakIterator();
} else {
throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
index 074f9f4..3f25464 100644
--- a/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
+++ b/solr/core/src/test/org/apache/solr/highlight/TestPostingsSolrHighlighter.java
@@ -145,6 +145,19 @@ public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
}
+ public void testBreakIterator3() {
+ assertU(adoc("text", "This document contains # special characters, while the other document contains the same # special character.", "id", "103"));
+ assertU(adoc("text", "While the other document contains the same # special character.", "id", "104"));
+ assertU(commit());
+ assertQ("different breakiterator",
+ req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#"),
+ "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
+ assertQ("different breakiterator",
+ req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#"),
+ "//lst[@name='highlighting']/lst[@name='104']/arr[@name='text']/str='While the other <em>document</em> contains the same #'");
+
+ }
+
public void testEncoder() {
assertU(adoc("text", "Document one has a first <i>sentence</i>.", "id", "103"));
assertU(commit());
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
index d452829..9835518 100644
--- a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
+++ b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
@@ -221,6 +221,19 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
req("q", "text:document", "sort", "id asc", "hl", "true", "hl.fragsize", "0"),
"//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='<em>Document</em> one has a first sentence. <em>Document</em> two has a second sentence.'");
}
+
+ public void testBreakIteratorCustom() {
+ assertU(adoc("text", "This document contains # special characters, while the other document contains the same # special character.", "id", "103"));
+ assertU(adoc("text", "While the other document contains the same # special character.", "id", "104"));
+ assertU(commit());
+ assertQ("CUSTOM breakiterator",
+ req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
+ "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
+ assertQ("different breakiterator",
+ req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "-1"),
+ "//lst[@name='highlighting']/lst[@name='104']/arr[@name='text']/str='While the other <em>document</em> contains the same #'");
+
+ }
public void testFragsize() {
// test default is 70... so make a sentence that is a little less (closer to 70 than end of text)
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/d1d73bfb/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
----------------------------------------------------------------------
diff --git a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
index 57f6734..3741214 100644
--- a/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
+++ b/solr/solrj/src/java/org/apache/solr/common/params/HighlightParams.java
@@ -63,6 +63,7 @@ public interface HighlightParams {
public static final String BS_LANGUAGE = HIGHLIGHT+".bs.language"; // FVH, UH, PH
public static final String BS_COUNTRY = HIGHLIGHT+".bs.country"; // FVH, UH, PH
public static final String BS_VARIANT = HIGHLIGHT+".bs.variant"; // FVH, UH, PH
+ public static final String BS_SEP = HIGHLIGHT+".bs.separator"; // UH, PH
// formatting
public static final String FORMATTER = HIGHLIGHT+".formatter"; // OH