You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2020/07/06 13:55:17 UTC

[lucene-solr] branch branch_8_6 updated: SOLR-14628: hl.fragsizeIsMinimum now defaults to true (#1651)

This is an automated email from the ASF dual-hosted git repository.

dsmiley pushed a commit to branch branch_8_6
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8_6 by this push:
     new 6a715e2  SOLR-14628: hl.fragsizeIsMinimum now defaults to true (#1651)
6a715e2 is described below

commit 6a715e2ecc3ff8a1d0312314d16196c5639881d0
Author: David Smiley <ds...@apache.org>
AuthorDate: Mon Jul 6 09:31:37 2020 -0400

    SOLR-14628: hl.fragsizeIsMinimum now defaults to true (#1651)
    
    Fragments size like <= Solr 8.4 and addresses a significant performance regression
    
    (cherry picked from commit 31e7ad9ee28763a0303f0d0a6f190b530d3b3ac1)
---
 solr/CHANGES.txt                                         |  3 +++
 .../apache/solr/highlight/UnifiedSolrHighlighter.java    |  3 ++-
 .../solr/highlight/TestUnifiedSolrHighlighter.java       | 16 ++++++++--------
 solr/solr-ref-guide/src/highlighting.adoc                | 13 +++++++++----
 solr/solr-ref-guide/src/solr-upgrade-notes.adoc          |  8 ++++++++
 5 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 4836bbc..e773dbf 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -201,6 +201,9 @@ Bug Fixes
 * SOLR-14584: Correct SOLR_SSL_KEY_STORE and SOLR_SSL_TRUST_STORE example comments in solr.in.sh and solr.in.cmd files
   (Aren Cambre via Christine Poerschke)
 
+* SOLR-14628: hl.fragsizeIsMinimum now defaults to true, thus fragments size like previous to Solr 8.5 and addresses a
+  significant performance regression. (David Smiley, Michal Hlavac)
+
 Other Changes
 ---------------------
 * SOLR-14197: SolrResourceLoader: marked many methods as deprecated, and in some cases rerouted exiting logic to avoid
diff --git a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
index b226663..6f2c1a9 100644
--- a/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
+++ b/solr/core/src/java/org/apache/solr/highlight/UnifiedSolrHighlighter.java
@@ -325,8 +325,9 @@ public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInf
       if (fragsize <= 1) { // no real minimum size
         return baseBI;
       }
+
       float fragalign = params.getFieldFloat(field, HighlightParams.FRAGALIGNRATIO, 0.5f);
-      if (params.getFieldBool(field, HighlightParams.FRAGSIZEISMINIMUM, false)) {
+      if (params.getFieldBool(field, HighlightParams.FRAGSIZEISMINIMUM, true)) {
         return LengthGoalBreakIterator.createMinLength(baseBI, fragsize, fragalign);
       }
       return LengthGoalBreakIterator.createClosestToLength(baseBI, fragsize, fragalign);
diff --git a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
index 9d890b1..a065fe2 100644
--- a/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
+++ b/solr/core/src/test/org/apache/solr/highlight/TestUnifiedSolrHighlighter.java
@@ -256,14 +256,14 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
     assertQ("CUSTOM breakiterator with fragsize 70 minimum",
         req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70", "hl.fragsizeIsMinimum", "true"),
         "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains # special characters, while the other <em>document</em> contains the same #'");
-    assertQ("CUSTOM breakiterator with fragsize 70",
-        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70"),
+    assertQ("CUSTOM breakiterator with fragsize 70 avg",
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "70", "hl.fragsizeIsMinimum", "false"),
         "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
-    assertQ("CUSTOM breakiterator with fragsize 90",
-        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "90"),
+    assertQ("CUSTOM breakiterator with fragsize 90 avg",
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "90", "hl.fragsizeIsMinimum", "false"),
         "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains #'");
-    assertQ("CUSTOM breakiterator with fragsize 100",
-        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "100"),
+    assertQ("CUSTOM breakiterator with fragsize 100 avg",
+        req("q", "text:document", "sort", "id asc", "hl", "true", "hl.bs.type", "SEPARATOR","hl.bs.separator","#","hl.fragsize", "100", "hl.fragsizeIsMinimum", "false"),
         "//lst[@name='highlighting']/lst[@name='103']/arr[@name='text']/str='This <em>document</em> contains # special characters, while the other <em>document</em> contains the same #'");
   }
 
@@ -279,10 +279,10 @@ public class TestUnifiedSolrHighlighter extends SolrTestCaseJ4 {
         req("q", "text:seventy", "hl", "true", "hl.fragsizeIsMinimum", "true", "hl.fragsize", "60"),
         "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. '");
     assertQ("smaller fragsize",
-        req("q", "text:seventy", "hl", "true"),
+        req("q", "text:seventy", "hl", "true", "hl.fragsizeIsMinimum", "false"),
         "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. '");
     assertQ("default fragsize",
-        req("q", "text:seventy", "hl", "true", "hl.fragsize", "90"),
+        req("q", "text:seventy", "hl", "true", "hl.fragsize", "90", "hl.fragsizeIsMinimum", "false"),
         "//lst[@name='highlighting']/lst[@name='10']/arr[@name='text']/str='This is a sentence just under <em>seventy</em> chars in length blah blah. Next sentence is here.'");
   }
   
diff --git a/solr/solr-ref-guide/src/highlighting.adoc b/solr/solr-ref-guide/src/highlighting.adoc
index cdbc873..a1ddd5b 100644
--- a/solr/solr-ref-guide/src/highlighting.adoc
+++ b/solr/solr-ref-guide/src/highlighting.adoc
@@ -227,12 +227,17 @@ By default, the Unified Highlighter will usually pick the right offset source (s
 The offset source can be explicitly configured to one of: `ANALYSIS`, `POSTINGS`, `POSTINGS_WITH_TERM_VECTORS`, or `TERM_VECTORS`.
 
 `hl.fragAlignRatio`::
-Fragment alignment can influence where the match in a passage is positioned. This floating point value is used to break the `hl.fragsize` around the match. The default value of `0.5` means to align the match to the middle. A value of `0.0` would mean to align the match to the left, while `1.0` to align it to the right. _Note: there are situations where the requested alignment is not plausible. This depends on the length of the match, the used breakiterator and the text content around the [...]
+This parameter influences where the first match (i.e. highlighted text) in a passage is positioned.
+The default value of `0.5` means to align the match to the middle.
+A value of `0.0` means to align the match to the left, while `1.0` to align it to the right.
+This setting is a best-effort hint, as there are a variety of factors.
+When there's lots of text to be highlighted, lowering this number can help performance a lot.
 
 `hl.fragsizeIsMinimum`::
-By default this parameter is `false`. In this case the highlighter will accept a shorter fragment than `hl.fragsize`, if that is closer to the target than what the next longer choice would be.
-+
-When treating `hl.fragsize` as a strict minimum length is acceptable, turning this on has some performance benefits to consider.
+When `true` (the default), the `hl.fragsize` parameter is treated as a (soft) minimum fragment size;
+provided there is enough text, the fragment is at least this size.
+When `false`, it's an optimal target -- the highlighter will _on average_ produce highlights of this length.
+A `false` setting is slower, particularly when there's lots of text and `hl.bs.type=SENTENCE`.
 
 `hl.tag.ellipsis`::
 By default, each snippet is returned as a separate value (as is done with the other highlighters). Set this parameter to instead return one string with this text as the delimiter. _Note: this is likely to be removed in the future._
diff --git a/solr/solr-ref-guide/src/solr-upgrade-notes.adoc b/solr/solr-ref-guide/src/solr-upgrade-notes.adoc
index 930743b..8b3bb81 100644
--- a/solr/solr-ref-guide/src/solr-upgrade-notes.adoc
+++ b/solr/solr-ref-guide/src/solr-upgrade-notes.adoc
@@ -81,6 +81,12 @@ For more information about how to use this, see the section <<exporting-result-s
 
 * The `stats`, `facet`, and `timeseries` expressions now support percentiles and standard deviation aggregations.
 
+*Highlighting*
+
+For the Unified Highlighter: The setting `hl.fragsizeIsMinimum` now defaults to `false` because `true` was found to be a significant performance regression when highlighting lots of text.
+This will yield longer highlights on average compared to Solr 8.5 but relatively unchanged compared to previous releases.
+Furthermore, if your application highlights lots of text, you may want to experiment with lowering `hl.fragAlignRatio` to trade ideal fragment alignment for better performance.
+
 *Restricting file paths*
 
 Several Solr APIs allow specifying a file system location. This includes core creation, backup, restore and others. Prior to Solr 8.6 these parameters were not validated, and Solr would allow any absolute or relative path. From 8.6 we by default allow only paths that are relative to `SOLR_HOME`, `SOLR_DATA_HOME` and `coreRootDir`. If you need to create a core or store a backup outside these pre-allowed paths, you now need to tell Solr about what paths to allow.
@@ -182,6 +188,8 @@ This tool is not yet officially documented in the Reference Guide, but draft doc
 Solr's Unified Highlighter now has two parameters to help control passage sizing, `hl.fragAlignRatio` and `hl.fragsizeIsMinimum`.
 See the section <<highlighting.adoc#the-unified-highlighter,The Unified Highlighter>> for details about these new parameters.
 Regardless of the settings, the passages may be sized differently than before.
+_Warning: These default settings were found to be a significant performance regression for apps that highlight lots of text with the default sentence break iterator.
+See the 8.6 upgrade notes for advise you can apply in 8.5._
 
 *Shared Library System Parameter*