You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by md...@apache.org on 2021/06/14 16:42:13 UTC

[solr] branch main updated: SOLR-15471 Rename parameter to langid.allowlist (#172)

This is an automated email from the ASF dual-hosted git repository.

mdrob pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 777d701  SOLR-15471 Rename parameter to langid.allowlist (#172)
777d701 is described below

commit 777d7010029f092946aaa53833f9ffdd487cd680
Author: Mike Drob <md...@apache.org>
AuthorDate: Mon Jun 14 11:42:04 2021 -0500

    SOLR-15471 Rename parameter to langid.allowlist (#172)
---
 solr/CHANGES.txt                                   |  2 ++
 .../apache/solr/update/processor/LangIdParams.java |  4 +++-
 .../LanguageIdentifierUpdateProcessor.java         | 23 +++++++++++++---------
 .../src/detecting-languages-during-indexing.adoc   |  4 ++--
 .../src/major-changes-in-solr-9.adoc               |  2 ++
 5 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 0e1099d..5eca8f9 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -315,6 +315,8 @@ Other Changes
 
 * SOLR-15385: Address many rawtypes warnings, resulting in several modified signatures in the public API. (Mike Drob, David Smiley)
 
+* SOLR-15471: Rename the langid.whitelist parameter to langid.allowlist (Mike Drob)
+
 Bug Fixes
 ---------------------
 * SOLR-14546: Fix for a relatively hard to hit issue in OverseerTaskProcessor that could lead to out of order execution
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
index 4e19eab..4dc04ee 100644
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
@@ -29,7 +29,9 @@ public interface LangIdParams {
   String OVERWRITE  = LANGUAGE_ID + ".overwrite";            // Overwrite if existing language value in LANG_FIELD
   String THRESHOLD  = LANGUAGE_ID + ".threshold";            // Detection threshold
   String ENFORCE_SCHEMA =  LANGUAGE_ID + ".enforceSchema";   // Enforces that output fields exist in schema
-  String LANG_WHITELIST  = LANGUAGE_ID + ".whitelist";       // Allowed languages
+  @Deprecated(since = "9.0.0")
+  String LANG_WHITELIST = LANGUAGE_ID + ".whitelist";        // Old property name for allowed languages
+  String LANG_ALLOWLIST = LANGUAGE_ID + ".allowlist";        // Allowed languages
   String LCMAP =  LANGUAGE_ID + ".lcmap";                    // Maps detected langcode to other value
   String MAP_ENABLE =  LANGUAGE_ID + ".map";                 // Turns on or off the field mapping
   String MAP_FL =  LANGUAGE_ID + ".map.fl";                  // Field list for mapping
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
index 90aef72..0043e46 100644
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@@ -45,7 +45,7 @@ import org.slf4j.LoggerFactory;
  *   Identifies the language of a set of input fields.
  *   Also supports mapping of field names based on detected language.
  * </p>
- * See <a href="https://lucene.apache.org/solr/guide/detecting-languages-during-indexing.html">Detecting Languages During Indexing</a> in reference guide
+ * See <a href="https://solr.apache.org/guide/detecting-languages-during-indexing.html">Detecting Languages During Indexing</a> in reference guide
  * @since 3.5
  * @lucene.experimental
  */
@@ -71,7 +71,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
   protected boolean mapIndividual;
   protected boolean enforceSchema;
   protected double threshold;
-  protected HashSet<String> langWhitelist;
+  protected HashSet<String> langAllowlist;
   protected HashSet<String> mapIndividualFieldsSet;
   protected HashSet<String> allMapFieldsSet;
   protected HashMap<String,String> lcMap;
@@ -108,11 +108,16 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
         fallbackFields = params.get(FALLBACK_FIELDS).split(",");
       }
       overwrite = params.getBool(OVERWRITE, false);
-      langWhitelist = new HashSet<>();
+      langAllowlist = new HashSet<>();
       threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT);
-      if(params.get(LANG_WHITELIST, "").length() > 0) {
-        for(String lang : params.get(LANG_WHITELIST, "").split(",")) {
-          langWhitelist.add(lang);
+      String legacyAllowList = params.get(LANG_WHITELIST, "");
+      if(legacyAllowList.length() > 0) {
+        // nowarn compile time string concatenation
+        log.warn(LANG_WHITELIST + " parameter is deprecated; use " + LANG_ALLOWLIST + " instead."); // nowarn
+      }
+      if(params.get(LANG_ALLOWLIST, legacyAllowList).length() > 0) {
+        for(String lang : params.get(LANG_ALLOWLIST, legacyAllowList).split(",")) { // fall back to deprecated langid.whitelist value, same as the check above
+          langAllowlist.add(lang);
         }
       }
 
@@ -234,7 +239,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
         doc.setField(langField, docLang);
       }
     } else {
-      // langField is set, we sanity check it against whitelist and fallback
+      // langField is set, we check it against allowlist and fallback
       docLang = resolveLanguage(doc.getFieldValue(langField).toString(), fallbackLang);
       docLangs.add(docLang);
       log.debug("Field {} already contained value {}, not overwriting.", langField, docLang);
@@ -344,7 +349,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
     } else {
       DetectedLanguage lang = languages.get(0);
       String normalizedLang = normalizeLangCode(lang.getLangCode());
-      if(langWhitelist.isEmpty() || langWhitelist.contains(normalizedLang)) {
+      if(langAllowlist.isEmpty() || langAllowlist.contains(normalizedLang)) {
         if (log.isDebugEnabled()) {
           log.debug("Language detected {} with certainty {}", normalizedLang, lang.getCertainty());
         }
@@ -356,7 +361,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
         }
       } else {
         if (log.isDebugEnabled()) {
-          log.debug("Detected a language not in whitelist ({}), using fallback {}", lang.getLangCode(), fallbackLang);
+          log.debug("Detected a language not in allowlist ({}), using fallback {}", lang.getLangCode(), fallbackLang);
         }
         langStr = fallbackLang;
       }
diff --git a/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc b/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc
index 7924c1c..d318458 100644
--- a/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc
+++ b/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc
@@ -119,7 +119,7 @@ With longer text fields, a high threshold such as `0.8` will give good results.
 +
 The default is `0.5`.
 
-`langid.whitelist`::
+`langid.allowlist`::
 Specifies a list of allowed language identification codes. Use this in combination with `langid.map` to ensure that you only index documents into fields that are in your schema.
 
 `langid.map`::
@@ -141,7 +141,7 @@ A comma-separated list of fields for use with `langid.map.individual` that is di
 Specifies a language code to use if no language is detected or specified in `langid.fallbackFields`.
 
 `langid.fallbackFields`::
-If no language is detected that meets the `langid.threshold` score, or if the detected language is not on the `langid.whitelist`, this field specifies language codes to be used as fallback values.
+If no language is detected that meets the `langid.threshold` score, or if the detected language is not on the `langid.allowlist`, this field specifies language codes to be used as fallback values.
 +
 If no appropriate fallback languages are found, Solr will use the language code specified in `langid.fallback`.
 
diff --git a/solr/solr-ref-guide/src/major-changes-in-solr-9.adoc b/solr/solr-ref-guide/src/major-changes-in-solr-9.adoc
index cc2a9ab..3a08339 100644
--- a/solr/solr-ref-guide/src/major-changes-in-solr-9.adoc
+++ b/solr/solr-ref-guide/src/major-changes-in-solr-9.adoc
@@ -156,6 +156,8 @@ _(raw; not yet edited)_
 
 * SOLR-15409: Zookeeper client libraries upgraded to 3.7.0, which may not be compatible with your existing server installations
 
+* SOLR-15471: The language identification `langid.whitelist` parameter has been renamed to `langid.allowlist` to better convey the meaning of the property. The old name is deprecated, and a warning is logged when it is used.
+
 === Upgrade Prerequisites in Solr 9
 
 * Upgrade all collections in stateFormat=1 to stateFormat=2 *before* upgrading to Solr 9, as Solr 9 does not support the