You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by lsanchez <ls...@scarlet.be> on 2014/09/09 12:02:48 UTC
Re: Language detection for multivalued field
Hi all,
I don't know if this can help somebody, but I've changed the process method of
the class LanguageIdentifierUpdateProcessor so that it supports
multivalued fields, and it works pretty well:
protected SolrInputDocument process(SolrInputDocument doc) {
String docLang = null;
HashSet<String> docLangs = new HashSet<String>();
String fallbackLang = getFallbackLang(doc, fallbackFields,
fallbackValue);
if(langField == null || !doc.containsKey(langField) ||
(doc.containsKey(langField) && overwrite)) {
String allText = concatFields(doc, inputFields);
List<DetectedLanguage> languagelist = detectLanguage(allText);
docLang = resolveLanguage(languagelist, fallbackLang);
docLangs.add(docLang);
log.debug("Detected main document language from fields " +
inputFields.toString() + ": "+docLang);
if(doc.containsKey(langField) && overwrite) {
log.debug("Overwritten old value "+doc.getFieldValue(langField));
}
if(langField != null && langField.length() != 0) {
doc.setField(langField, docLang);
}
} else {
// langField is set, we sanity check it against whitelist and fallback
docLang = resolveLanguage((String) doc.getFieldValue(langField),
fallbackLang);
docLangs.add(docLang);
log.debug("Field "+langField+" already contained value "+docLang+",
not overwriting.");
}
if(enableMapping) {
for (String fieldName : allMapFieldsSet) {
if(doc.containsKey(fieldName)) {
String fieldLang="";
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
Collection c = doc.getFieldValues(fieldName);
for (Object o : c){
if(o instanceof String ){
List<DetectedLanguage> languagelist =
detectLanguage((String) o);
fieldLang = resolveLanguage(languagelist, docLang);
docLangs.add(fieldLang);
log.debug("Mapping multivalued field "+fieldName+"
using individually detected language "+fieldLang);
String mappedOutputField = getMappedField(fieldName,
fieldLang);
if (mappedOutputField != null) {
log.debug("Mapping multivalued field {} to {}",
doc.getFieldValue(docIdField), fieldLang);
SolrInputField inField = new SolrInputField
(fieldName);
Collection currentContent
=doc.getFieldValues(mappedOutputField);
if (currentContent != null &&
currentContent.size()>0){
doc.addField(mappedOutputField, o);
}
else{
inField.setValue(o,
doc.getField(fieldName).getBoost());
doc.setField(mappedOutputField,
inField.getValue(), inField.getBoost());
}
if(!mapKeepOrig) {
log.debug("Removing old field {}", fieldName);
doc.removeField(fieldName);
}
} else {
throw new
SolrException(SolrException.ErrorCode.BAD_REQUEST, "Invalid output field
mapping for "
+ fieldName + " field and language: " +
fieldLang);
}
}
}
} else {
fieldLang = docLang;
log.debug("Mapping field "+fieldName+" using document global
language "+fieldLang);
String mappedOutputField = getMappedField(fieldName, fieldLang);
if (mappedOutputField != null) {
log.debug("Mapping field {} to {}",
doc.getFieldValue(docIdField), fieldLang);
SolrInputField inField = doc.getField(fieldName);
doc.setField(mappedOutputField, inField.getValue(),
inField.getBoost());
if(!mapKeepOrig) {
log.debug("Removing old field {}", fieldName);
doc.removeField(fieldName);
}
} else {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Invalid output field mapping for "
+ fieldName + " field and language: " + fieldLang);
}
}
}
}
}
// Set the languages field to an array of all detected languages
if(langsField != null && langsField.length() != 0) {
doc.setField(langsField, docLangs.toArray());
}
return doc;
}
--
View this message in context: http://lucene.472066.n3.nabble.com/Language-detection-for-multivalued-field-tp4096996p4157573.html
Sent from the Solr - User mailing list archive at Nabble.com.