You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by md...@apache.org on 2021/06/14 16:42:13 UTC
[solr] branch main updated: SOLR-15471 Rename parameter to
langid.allowlist (#172)
This is an automated email from the ASF dual-hosted git repository.
mdrob pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new 777d701 SOLR-15471 Rename parameter to langid.allowlist (#172)
777d701 is described below
commit 777d7010029f092946aaa53833f9ffdd487cd680
Author: Mike Drob <md...@apache.org>
AuthorDate: Mon Jun 14 11:42:04 2021 -0500
SOLR-15471 Rename parameter to langid.allowlist (#172)
---
solr/CHANGES.txt | 2 ++
.../apache/solr/update/processor/LangIdParams.java | 4 +++-
.../LanguageIdentifierUpdateProcessor.java | 23 +++++++++++++---------
.../src/detecting-languages-during-indexing.adoc | 4 ++--
.../src/major-changes-in-solr-9.adoc | 2 ++
5 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 0e1099d..5eca8f9 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -315,6 +315,8 @@ Other Changes
* SOLR-15385: Address many rawtypes warnings, resulting in several modified signatures in the public API. (Mike Drob, David Smiley)
+* SOLR-15471: Rename lang id whitelist parameter to allowlist (Mike Drob)
+
Bug Fixes
---------------------
* SOLR-14546: Fix for a relatively hard to hit issue in OverseerTaskProcessor that could lead to out of order execution
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
index 4e19eab..4dc04ee 100644
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
@@ -29,7 +29,9 @@ public interface LangIdParams {
String OVERWRITE = LANGUAGE_ID + ".overwrite"; // Overwrite if existing language value in LANG_FIELD
String THRESHOLD = LANGUAGE_ID + ".threshold"; // Detection threshold
String ENFORCE_SCHEMA = LANGUAGE_ID + ".enforceSchema"; // Enforces that output fields exist in schema
- String LANG_WHITELIST = LANGUAGE_ID + ".whitelist"; // Allowed languages
+ @Deprecated(since = "9.0.0")
+ String LANG_WHITELIST = LANGUAGE_ID + ".whitelist"; // Old property name for allowed languages
+ String LANG_ALLOWLIST = LANGUAGE_ID + ".allowlist"; // Allowed languages
String LCMAP = LANGUAGE_ID + ".lcmap"; // Maps detected langcode to other value
String MAP_ENABLE = LANGUAGE_ID + ".map"; // Turns on or off the field mapping
String MAP_FL = LANGUAGE_ID + ".map.fl"; // Field list for mapping
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
index 90aef72..0043e46 100644
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@@ -45,7 +45,7 @@ import org.slf4j.LoggerFactory;
* Identifies the language of a set of input fields.
* Also supports mapping of field names based on detected language.
* </p>
- * See <a href="https://lucene.apache.org/solr/guide/detecting-languages-during-indexing.html">Detecting Languages During Indexing</a> in reference guide
+ * See <a href="https://solr.apache.org/guide/detecting-languages-during-indexing.html">Detecting Languages During Indexing</a> in reference guide
* @since 3.5
* @lucene.experimental
*/
@@ -71,7 +71,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
protected boolean mapIndividual;
protected boolean enforceSchema;
protected double threshold;
- protected HashSet<String> langWhitelist;
+ protected HashSet<String> langAllowlist;
protected HashSet<String> mapIndividualFieldsSet;
protected HashSet<String> allMapFieldsSet;
protected HashMap<String,String> lcMap;
@@ -108,11 +108,16 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
fallbackFields = params.get(FALLBACK_FIELDS).split(",");
}
overwrite = params.getBool(OVERWRITE, false);
- langWhitelist = new HashSet<>();
+ langAllowlist = new HashSet<>();
threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT);
- if(params.get(LANG_WHITELIST, "").length() > 0) {
- for(String lang : params.get(LANG_WHITELIST, "").split(",")) {
- langWhitelist.add(lang);
+ String legacyAllowList = params.get(LANG_WHITELIST, ""); // deprecated parameter name, used as the fallback value below
+ if(legacyAllowList.length() > 0) {
+ // nowarn compile time string concatenation
+ log.warn(LANG_WHITELIST + " parameter is deprecated; use " + LANG_ALLOWLIST + " instead."); // nowarn
+ }
+ if(params.get(LANG_ALLOWLIST, legacyAllowList).length() > 0) {
+ for(String lang : params.get(LANG_ALLOWLIST, legacyAllowList).split(",")) { // same default as the check above, so a legacy-only config is honored
+ langAllowlist.add(lang);
+ }
}
@@ -234,7 +239,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
doc.setField(langField, docLang);
}
} else {
- // langField is set, we sanity check it against whitelist and fallback
+ // langField is set, we check it against allowlist and fallback
docLang = resolveLanguage(doc.getFieldValue(langField).toString(), fallbackLang);
docLangs.add(docLang);
log.debug("Field {} already contained value {}, not overwriting.", langField, docLang);
@@ -344,7 +349,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
} else {
DetectedLanguage lang = languages.get(0);
String normalizedLang = normalizeLangCode(lang.getLangCode());
- if(langWhitelist.isEmpty() || langWhitelist.contains(normalizedLang)) {
+ if(langAllowlist.isEmpty() || langAllowlist.contains(normalizedLang)) {
if (log.isDebugEnabled()) {
log.debug("Language detected {} with certainty {}", normalizedLang, lang.getCertainty());
}
@@ -356,7 +361,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
}
} else {
if (log.isDebugEnabled()) {
- log.debug("Detected a language not in whitelist ({}), using fallback {}", lang.getLangCode(), fallbackLang);
+ log.debug("Detected a language not in allowlist ({}), using fallback {}", lang.getLangCode(), fallbackLang);
}
langStr = fallbackLang;
}
diff --git a/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc b/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc
index 7924c1c..d318458 100644
--- a/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc
+++ b/solr/solr-ref-guide/src/detecting-languages-during-indexing.adoc
@@ -119,7 +119,7 @@ With longer text fields, a high threshold such as `0.8` will give good results.
+
The default is `0.5`.
-`langid.whitelist`::
+`langid.allowlist`::
Specifies a list of allowed language identification codes. Use this in combination with `langid.map` to ensure that you only index documents into fields that are in your schema.
`langid.map`::
@@ -141,7 +141,7 @@ A comma-separated list of fields for use with `langid.map.individual` that is di
Specifies a language code to use if no language is detected or specified in `langid.fallbackFields`.
`langid.fallbackFields`::
-If no language is detected that meets the `langid.threshold` score, or if the detected language is not on the `langid.whitelist`, this field specifies language codes to be used as fallback values.
+If no language is detected that meets the `langid.threshold` score, or if the detected language is not on the `langid.allowlist`, this field specifies language codes to be used as fallback values.
+
If no appropriate fallback languages are found, Solr will use the language code specified in `langid.fallback`.
diff --git a/solr/solr-ref-guide/src/major-changes-in-solr-9.adoc b/solr/solr-ref-guide/src/major-changes-in-solr-9.adoc
index cc2a9ab..3a08339 100644
--- a/solr/solr-ref-guide/src/major-changes-in-solr-9.adoc
+++ b/solr/solr-ref-guide/src/major-changes-in-solr-9.adoc
@@ -156,6 +156,8 @@ _(raw; not yet edited)_
* SOLR-15409: Zookeeper client libraries upgraded to 3.7.0, which may not be compatible with your existing server installations
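+* SOLR-15471: The language identification `langid.whitelist` parameter has been renamed to `langid.allowlist` to better convey the meaning of the property. The old `langid.whitelist` name is deprecated; update your configuration to use `langid.allowlist`.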
+
=== Upgrade Prerequisites in Solr 9
* Upgrade all collections in stateFormat=1 to stateFormat=2 *before* upgrading to Solr 9, as Solr 9 does not support the