You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2013/07/02 16:56:23 UTC
svn commit: r1498961 - in /lucene/dev/branches/branch_4x: ./ solr/
solr/contrib/ solr/contrib/langid/src/java/org/apache/solr/update/processor/
solr/contrib/langid/src/test/org/apache/solr/update/processor/
Author: janhoy
Date: Tue Jul 2 14:56:22 2013
New Revision: 1498961
URL: http://svn.apache.org/r1498961
Log:
SOLR-4412: LanguageIdentifier lcmap for language field (merge from trunk)
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/solr/contrib/ (props changed)
lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
lucene/dev/branches/branch_4x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1498961&r1=1498960&r2=1498961&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Tue Jul 2 14:56:22 2013
@@ -223,6 +223,9 @@ Optimizations
* SOLR-4955: Admin UI - Show address bar on top for Schema + Config (steffkes)
+* SOLR-4412: New parameter langid.lcmap to map detected language code to be placed
+ in "language" field (janhoy)
+
Other Changes
----------------------
Modified: lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java?rev=1498961&r1=1498960&r2=1498961&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java Tue Jul 2 14:56:22 2013
@@ -31,6 +31,7 @@ public interface LangIdParams {
String THRESHOLD = LANGUAGE_ID + ".threshold"; // Detection threshold
String ENFORCE_SCHEMA = LANGUAGE_ID + ".enforceSchema"; // Enforces that output fields exist in schema
String LANG_WHITELIST = LANGUAGE_ID + ".whitelist"; // Allowed languages
+ String LCMAP = LANGUAGE_ID + ".lcmap"; // Maps detected langcode to other value
String MAP_ENABLE = LANGUAGE_ID + ".map"; // Turns on or off the field mapping
String MAP_FL = LANGUAGE_ID + ".map.fl"; // Field list for mapping
String MAP_OVERWRITE = LANGUAGE_ID + ".map.overwrite"; // Whether to overwrite existing fields
Modified: lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java?rev=1498961&r1=1498960&r2=1498961&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java Tue Jul 2 14:56:22 2013
@@ -75,6 +75,7 @@ public abstract class LanguageIdentifier
protected HashSet<String> mapIndividualFieldsSet;
protected HashSet<String> allMapFieldsSet;
protected HashMap<String,String> lcMap;
+ protected HashMap<String,String> mapLcMap;
protected IndexSchema schema;
// Regex patterns
@@ -138,13 +139,26 @@ public abstract class LanguageIdentifier
allMapFieldsSet.addAll(mapIndividualFieldsSet);
}
- // Language Code mapping
+ // Langcode normalization (langid.lcmap): maps a detected langcode onto its normalized value
lcMap = new HashMap<String,String>();
+ if(params.get(LCMAP) != null) {
+ for(String mapping : params.get(LCMAP).split("[, ]")) {
+ String[] keyVal = mapping.split(":");
+ if(keyVal.length == 2) {
+ lcMap.put(keyVal[0], keyVal[1]);
+ } else {
+ log.error("Unsupported format for langid.lcmap: "+mapping+". Skipping this mapping.");
+ }
+ }
+ }
+
+ // Language code mapping used when building mapped field names (langid.map.lcmap)
+ mapLcMap = new HashMap<String,String>();
if(params.get(MAP_LCMAP) != null) {
for(String mapping : params.get(MAP_LCMAP).split("[, ]")) {
String[] keyVal = mapping.split(":");
if(keyVal.length == 2) {
- lcMap.put(keyVal[0], keyVal[1]);
+ mapLcMap.put(keyVal[0], keyVal[1]);
} else {
log.error("Unsupported format for langid.map.lcmap: "+mapping+". Skipping this mapping.");
}
@@ -322,10 +336,11 @@ public abstract class LanguageIdentifier
langStr = fallbackLang;
} else {
DetectedLanguage lang = languages.get(0);
- if(langWhitelist.isEmpty() || langWhitelist.contains(lang.getLangCode())) {
- log.debug("Language detected {} with certainty {}", lang.getLangCode(), lang.getCertainty());
+ String normalizedLang = normalizeLangCode(lang.getLangCode());
+ if(langWhitelist.isEmpty() || langWhitelist.contains(normalizedLang)) {
+ log.debug("Language detected {} with certainty {}", normalizedLang, lang.getCertainty());
if(lang.getCertainty() >= threshold) {
- langStr = lang.getLangCode();
+ langStr = normalizedLang;
} else {
log.debug("Detected language below threshold {}, using fallback {}", threshold, fallbackLang);
langStr = fallbackLang;
@@ -345,6 +360,20 @@ public abstract class LanguageIdentifier
}
/**
+ * Looks up language code in map (langid.lcmap) and returns mapped value.
+ * A language code that has no entry in the map is returned unchanged.
+ * @param langCode the language code string returned from detector
+ * @return the normalized/mapped language code, or langCode itself when no mapping exists
+ */
+ protected String normalizeLangCode(String langCode) {
+ // Single lookup instead of containsKey() followed by get()
+ String lc = lcMap.get(langCode);
+ if (lc == null) {
+ return langCode;
+ }
+ // Parameterized message: not assembled unless debug logging is enabled
+ log.debug("Doing langcode normalization mapping from {} to {}", langCode, lc);
+ return lc;
+ }
+
+ /**
* Returns the name of the field to map the current contents into, so that they are properly analyzed. For instance
* if the currentField is "text" and the code is "en", the new field would by default be "text_en".
* This method also performs custom regex pattern replace if configured. If enforceSchema=true
@@ -355,7 +384,7 @@ public abstract class LanguageIdentifier
* @return The new schema field name, based on pattern and replace, or null if illegal
*/
protected String getMappedField(String currentField, String language) {
- String lc = lcMap.containsKey(language) ? lcMap.get(language) : language;
+ String lc = mapLcMap.containsKey(language) ? mapLcMap.get(language) : language;
String newFieldName = langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc);
if(enforceSchema && schema.getFieldOrNull(newFieldName) == null) {
log.warn("Unsuccessful field name mapping from {} to {}, field does not exist and enforceSchema=true; skipping mapping.", currentField, newFieldName);
Modified: lucene/dev/branches/branch_4x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java?rev=1498961&r1=1498960&r2=1498961&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java Tue Jul 2 14:56:22 2013
@@ -116,6 +116,22 @@ public abstract class LanguageIdentifier
}
@Test
+ // Checks that codes listed in langid.lcmap are replaced by their mapped value
+ // and that codes without a mapping are returned as-is.
+ public void testMapLangcode() throws Exception {
+ parameters = new ModifiableSolrParams();
+ parameters.add("langid.fl", "name");
+ // Two mappings separated by a space: zh_cn -> zh and zh_tw -> zh
+ parameters.add("langid.lcmap", "zh_cn:zh zh_tw:zh");
+ parameters.set("langid.enforceSchema", "false");
+ liProcessor = createLangIdProcessor(parameters);
+
+ // Second argument is presumably the fallback language - TODO confirm against resolveLanguage
+ assertEquals("zh", liProcessor.resolveLanguage("zh_cn", "NA"));
+ assertEquals("zh", liProcessor.resolveLanguage("zh_tw", "NA"));
+ // "no" has no entry in langid.lcmap, so it is expected back unchanged
+ assertEquals("no", liProcessor.resolveLanguage("no", "NA"));
+ // Same expectation when passing a list of DetectedLanguage instead of a plain code
+ List<DetectedLanguage> langs = new ArrayList<DetectedLanguage>();
+ langs.add(new DetectedLanguage("zh_cn", 0.8));
+ assertEquals("zh", liProcessor.resolveLanguage(langs, "NA"));
+ }
+
+ @Test
public void testPreExisting() throws Exception {
SolrInputDocument doc;
parameters = new ModifiableSolrParams();