You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by lsanchez <ls...@scarlet.be> on 2014/09/09 12:02:48 UTC

Re: Language detection for multivalued field

Hi all,
I don't know if this can help somebody, but I've changed the process method
of the class LanguageIdentifierUpdateProcessor in order to support
multivalued fields, and it works pretty well.


protected SolrInputDocument process(SolrInputDocument doc) {
    String docLang = null;
    HashSet<String> docLangs = new HashSet<String>();
    String fallbackLang = getFallbackLang(doc, fallbackFields,
fallbackValue);

    if(langField == null || !doc.containsKey(langField) ||
(doc.containsKey(langField) && overwrite)) {
      String allText = concatFields(doc, inputFields);
      List<DetectedLanguage> languagelist = detectLanguage(allText);
      docLang = resolveLanguage(languagelist, fallbackLang);
      docLangs.add(docLang);
      log.debug("Detected main document language from fields " +
inputFields.toString() + ": "+docLang);

      if(doc.containsKey(langField) && overwrite) {
        log.debug("Overwritten old value "+doc.getFieldValue(langField));
      }
      if(langField != null && langField.length() != 0) {
        doc.setField(langField, docLang);
      }
    } else {
      // langField is set, we sanity check it against whitelist and fallback
      docLang = resolveLanguage((String) doc.getFieldValue(langField),
fallbackLang);
      docLangs.add(docLang);
      log.debug("Field "+langField+" already contained value "+docLang+",
not overwriting.");
    }

    if(enableMapping) {
      for (String fieldName : allMapFieldsSet) {
        if(doc.containsKey(fieldName)) {
          String fieldLang="";
          if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
            
            Collection c = doc.getFieldValues(fieldName);
            for (Object o : c){
                if(o instanceof String ){
                    List<DetectedLanguage> languagelist =
detectLanguage((String) o);
                    fieldLang = resolveLanguage(languagelist, docLang);
                    docLangs.add(fieldLang);
                    log.debug("Mapping multivalued  field "+fieldName+"
using individually detected language "+fieldLang);
                    String mappedOutputField = getMappedField(fieldName,
fieldLang);
                    if (mappedOutputField != null) {
                        log.debug("Mapping multivalued field {} to {}",
doc.getFieldValue(docIdField), fieldLang);
                        SolrInputField inField = new SolrInputField
(fieldName);
                        Collection currentContent
=doc.getFieldValues(mappedOutputField);
                        if (currentContent != null &&
currentContent.size()>0){
                            doc.addField(mappedOutputField, o);
                            
                        }
                        else{
                            inField.setValue(o,
doc.getField(fieldName).getBoost());
                            doc.setField(mappedOutputField,
inField.getValue(), inField.getBoost());
                        }
                        
                                               
                        
                        if(!mapKeepOrig) {
                          log.debug("Removing old field {}", fieldName);
                          doc.removeField(fieldName);
                        }
                      } else {
                        throw new
SolrException(SolrException.ErrorCode.BAD_REQUEST, "Invalid output field
mapping for "
                                + fieldName + " field and language: " +
fieldLang);
                      }
                }
            }
            
          } else {
            
            fieldLang = docLang;
            log.debug("Mapping field "+fieldName+" using document global
language "+fieldLang);
            String mappedOutputField = getMappedField(fieldName, fieldLang);

            if (mappedOutputField != null) {
              log.debug("Mapping field {} to {}",
doc.getFieldValue(docIdField), fieldLang);
              SolrInputField inField = doc.getField(fieldName);
              doc.setField(mappedOutputField, inField.getValue(),
inField.getBoost());
              if(!mapKeepOrig) {
                log.debug("Removing old field {}", fieldName);
                doc.removeField(fieldName);
              }
            } else {
              throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Invalid output field mapping for "
                      + fieldName + " field and language: " + fieldLang);
            }
          }
          
        }
      }
    }

    // Set the languages field to an array of all detected languages
    if(langsField != null && langsField.length() != 0) {
      doc.setField(langsField, docLangs.toArray());
    }

    return doc;
  }



--
View this message in context: http://lucene.472066.n3.nabble.com/Language-detection-for-multivalued-field-tp4096996p4157573.html
Sent from the Solr - User mailing list archive at Nabble.com.