You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2014/12/05 19:53:51 UTC

svn commit: r1643390 - in /lucene/dev/branches/branch_5x/solr: ./ contrib/ contrib/langid/src/java/org/apache/solr/update/processor/ contrib/langid/src/test/org/apache/solr/update/processor/

Author: sarowe
Date: Fri Dec  5 18:53:50 2014
New Revision: 1643390

URL: http://svn.apache.org/r1643390
Log:
SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:
- Added langid.maxFieldValueChars and langid.maxTotalChars params to limit
  input, by default 10k and 20k chars, respectively.
- Moved input concatenation to Tika implementation; the langdetect
  implementation instead appends each input piece via the langdetect API. 
(merged trunk r1643377)

Modified:
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
    lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
    lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
    lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
    lucene/dev/branches/branch_5x/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java

Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1643390&r1=1643389&r2=1643390&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Fri Dec  5 18:53:50 2014
@@ -268,6 +268,13 @@ Bug Fixes
 
 * SOLR-6763: Shard leader elections should not persist across session expiry
   (Alan Woodward, Mark Miller)
+  
+* SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:
+  - Added langid.maxFieldValueChars and langid.maxTotalChars params to limit
+    input, by default 10k and 20k chars, respectively.
+  - Moved input concatenation to Tika implementation; the langdetect
+    implementation instead appends each input piece via the langdetect API.
+  (Vitaliy Zhovtyuk, Tomás Fernández Löbbe, Rob Tulloh, Steve Rowe)
 
 Optimizations
 ----------------------

Modified: lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java?rev=1643390&r1=1643389&r2=1643390&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java Fri Dec  5 18:53:50 2014
@@ -18,6 +18,7 @@ package org.apache.solr.update.processor
  */
 
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 
@@ -28,6 +29,7 @@ import com.cybozu.labs.langdetect.Detect
 import com.cybozu.labs.langdetect.DetectorFactory;
 import com.cybozu.labs.langdetect.LangDetectException;
 import com.cybozu.labs.langdetect.Language;
+import org.apache.solr.common.SolrInputDocument;
 
 /**
  * Identifies the language of a set of input fields using http://code.google.com/p/language-detection
@@ -43,15 +45,32 @@ public class LangDetectLanguageIdentifie
   }
 
   @Override
-  protected List<DetectedLanguage> detectLanguage(String content) {
-    if (content.trim().length() == 0) { // to be consistent with the tika impl?
-      log.debug("No input text to detect language from, returning empty list");
-      return Collections.emptyList();
-    }
-    
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
     try {
       Detector detector = DetectorFactory.create();
-      detector.append(content);
+      detector.setMaxTextLength(maxTotalChars);
+
+      for (String fieldName : inputFields) {
+        log.debug("Appending field " + fieldName);
+        if (doc.containsKey(fieldName)) {
+          Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+          if (fieldValues != null) {
+            for (Object content : fieldValues) {
+              if (content instanceof String) {
+                String stringContent = (String) content;
+                if (stringContent.length() > maxFieldValueChars) {
+                  detector.append(stringContent.substring(0, maxFieldValueChars));
+                } else {
+                  detector.append(stringContent);
+                }
+                detector.append(" ");
+              } else {
+                log.warn("Field " + fieldName + " not a String value, not including in detection");
+              }
+            }
+          }
+        }
+      }
       ArrayList<Language> langlist = detector.getProbabilities();
       ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
       for (Language l: langlist) {

Modified: lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java?rev=1643390&r1=1643389&r2=1643390&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java Fri Dec  5 18:53:50 2014
@@ -41,12 +41,16 @@ public interface LangIdParams {
   String MAP_LCMAP =  LANGUAGE_ID + ".map.lcmap";            // Enables mapping multiple langs to same output field
   String MAP_PATTERN =  LANGUAGE_ID + ".map.pattern";        // RegEx pattern to match field name
   String MAP_REPLACE =  LANGUAGE_ID + ".map.replace";        // Replace pattern
+  String MAX_FIELD_VALUE_CHARS = LANGUAGE_ID + ".maxFieldValueChars";   // Maximum number of characters taken from each field value for language detection
+  String MAX_TOTAL_CHARS = LANGUAGE_ID + ".maxTotalChars";   // Maximum number of characters taken in total, across all input field values, for language detection
 
   String DOCID_FIELD_DEFAULT = "id";
   String DOCID_LANGFIELD_DEFAULT = null;
   String DOCID_LANGSFIELD_DEFAULT = null;
   String MAP_PATTERN_DEFAULT = "(.*)";
   String MAP_REPLACE_DEFAULT = "$1_{lang}";
+  int MAX_FIELD_VALUE_CHARS_DEFAULT = 10000;
+  int MAX_TOTAL_CHARS_DEFAULT = 20000;
 
   // TODO: This default threshold accepts even "uncertain" detections. 
   // Increase &langid.threshold above 0.5 to return only certain detections

Modified: lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java?rev=1643390&r1=1643389&r2=1643390&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java Fri Dec  5 18:53:50 2014
@@ -78,6 +78,8 @@ public abstract class LanguageIdentifier
   protected HashMap<String,String> lcMap;
   protected HashMap<String,String> mapLcMap;
   protected IndexSchema schema;
+  protected int maxFieldValueChars;
+  protected int maxTotalChars;
 
   // Regex patterns
   protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
@@ -169,8 +171,21 @@ public abstract class LanguageIdentifier
 
       mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
       mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
-
-
+      maxFieldValueChars = params.getInt(MAX_FIELD_VALUE_CHARS, MAX_FIELD_VALUE_CHARS_DEFAULT);
+      maxTotalChars = params.getInt(MAX_TOTAL_CHARS, MAX_TOTAL_CHARS_DEFAULT);
+      if (maxFieldValueChars > maxTotalChars) {
+        if (maxTotalChars == MAX_TOTAL_CHARS_DEFAULT) {
+          // If the user specified only maxFieldValueChars, make maxTotalChars the same as it
+          log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is greater than " + MAX_TOTAL_CHARS + " ("
+              + maxTotalChars + ").  Setting " + MAX_TOTAL_CHARS + " to " + maxFieldValueChars + ".");
+          maxTotalChars = maxFieldValueChars;
+        } else {
+          // If the user specified maxTotalChars, make maxFieldValueChars the same as it
+          log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is greater than " + MAX_TOTAL_CHARS + " ("
+              + maxTotalChars + ").  Setting " + MAX_FIELD_VALUE_CHARS + " to " + maxTotalChars + ".");
+          maxFieldValueChars = maxTotalChars;
+        }
+      }
     }
     log.debug("LangId configured");
 
@@ -203,11 +218,10 @@ public abstract class LanguageIdentifier
     String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
 
     if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) {
-      String allText = concatFields(doc, inputFields);
-      List<DetectedLanguage> languagelist = detectLanguage(allText);
+      List<DetectedLanguage> languagelist = detectLanguage(doc);
       docLang = resolveLanguage(languagelist, fallbackLang);
       docLangs.add(docLang);
-      log.debug("Detected main document language from fields "+inputFields+": "+docLang);
+      log.debug("Detected main document language from fields "+ Arrays.toString(inputFields) +": "+docLang);
 
       if(doc.containsKey(langField) && overwrite) {
         log.debug("Overwritten old value "+doc.getFieldValue(langField));
@@ -227,8 +241,7 @@ public abstract class LanguageIdentifier
         if(doc.containsKey(fieldName)) {
           String fieldLang;
           if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
-            String text = (String) doc.getFieldValue(fieldName);
-            List<DetectedLanguage> languagelist = detectLanguage(text);
+            List<DetectedLanguage> languagelist = detectLanguage(doc);
             fieldLang = resolveLanguage(languagelist, docLang);
             docLangs.add(fieldLang);
             log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
@@ -284,37 +297,13 @@ public abstract class LanguageIdentifier
     return lang;
   }
 
-  /*
-   * Concatenates content from multiple fields
-   */
-  protected String concatFields(SolrInputDocument doc, String[] fields) {
-    StringBuilder sb = new StringBuilder();
-    for (String fieldName : inputFields) {
-      log.debug("Appending field "+fieldName);
-      if (doc.containsKey(fieldName)) {
-        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
-        if (fieldValues != null) {
-          for (Object content : fieldValues) {
-            if (content instanceof String) {
-              sb.append((String) content);
-              sb.append(" ");
-            } else {
-              log.warn("Field " + fieldName + " not a String value, not including in detection");
-            }
-          }
-        }
-      }
-    }
-    return sb.toString();
-  }
-
   /**
    * Detects language(s) from a string.
    * Classes wishing to implement their own language detection module should override this method.
    * @param content The content to identify
    * @return List of detected language(s) according to RFC-3066
    */
-  protected abstract List<DetectedLanguage> detectLanguage(String content);
+  protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
 
   /**
    * Chooses a language based on the list of candidates detected

Modified: lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java?rev=1643390&r1=1643389&r2=1643390&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java Fri Dec  5 18:53:50 2014
@@ -24,6 +24,9 @@ import org.apache.solr.request.SolrQuery
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.tika.language.LanguageIdentifier;
 
+import org.apache.solr.common.SolrInputDocument;
+import java.util.Collection;
+
 /**
  * Identifies the language of a set of input fields using Tika's
  * LanguageIdentifier.
@@ -40,9 +43,10 @@ public class TikaLanguageIdentifierUpdat
   }
   
   @Override
-  protected List<DetectedLanguage> detectLanguage(String content) {
+  protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
     List<DetectedLanguage> languages = new ArrayList<>();
-    if(content.trim().length() != 0) { 
+    String content = concatFields(doc);
+    if (content.length() != 0) {
       LanguageIdentifier identifier = new LanguageIdentifier(content);
       // FIXME: Hack - we get the distance from toString and calculate our own certainty score
       Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
@@ -57,4 +61,59 @@ public class TikaLanguageIdentifierUpdat
     }
     return languages;
   }
+
+
+  /**
+   * Concatenates the String values of all input fields, each followed by a space.
+   * Every value is cut to maxFieldValueChars and the result to maxTotalChars.
+   * @param doc the document whose input fields are read
+   * @return the concatenated, possibly truncated, text
+   */
+  protected String concatFields(SolrInputDocument doc) {
+    StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
+    for (String fieldName : inputFields) {
+      log.debug("Appending field " + fieldName);
+      if (doc.containsKey(fieldName)) {
+        Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+        if (fieldValues != null) {
+          for (Object content : fieldValues) {
+            if (content instanceof String) {
+              String stringContent = (String) content;
+              sb.append(stringContent, 0, Math.min(stringContent.length(), maxFieldValueChars));
+              sb.append(" ");
+              if (sb.length() >= maxTotalChars) {
+                // Limit reached: anything appended later would only be cut off again.
+                sb.setLength(maxTotalChars);
+                return sb.toString();
+              }
+            } else {
+              log.warn("Field " + fieldName + " not a String value, not including in detection");
+            }
+          }
+        }
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Estimates the length of the text concatFields builds, to pre-size its buffer.
+   * @param doc           solr input document
+   * @param fields        fields to select; those absent from doc are skipped
+   * @return expected size of string value, at most maxTotalChars
+   */
+  private int getExpectedSize(SolrInputDocument doc, String[] fields) {
+    int docSize = 0;
+    for (String field : fields) {
+      Collection<Object> contents = doc.getFieldValues(field);
+      if (contents == null) { continue; } // field not present in this document
+      for (Object content : contents) {
+        if (content instanceof String) {
+          docSize += Math.min(((String) content).length(), maxFieldValueChars);
+        }
+      }
+      docSize = Math.min(docSize, maxTotalChars);
+    }
+    return docSize;
+  }
 }

Modified: lucene/dev/branches/branch_5x/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java?rev=1643390&r1=1643389&r2=1643390&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java Fri Dec  5 18:53:50 2014
@@ -17,11 +17,166 @@ package org.apache.solr.update.processor
  * limitations under the License.
  */
 
+import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.ModifiableSolrParams;
+import org.junit.Test;
 
 public class TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageIdentifierUpdateProcessorFactoryTestCase {
   @Override
   protected LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
     return new TikaLanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(h.getCore(), parameters, null), resp, null);
   }
+
+
+  @Test
+  public void testMaxFieldValueChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+    // Case 1: no limit set, so the single field value comes back whole.
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+    // Case 2: a 6-char per-value limit keeps only "Apache".
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+    // Add a second input field for the multi-field cases below.
+    doc.addField("bar_s", valueF2);
+    // Case 3: two fields and no limit: both values, joined by a single space.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+    // Case 4: the 6-char limit applies to each value separately.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache" + " " + "An ope", p.concatFields(doc).trim());
+    // Case 5: a per-value limit larger than either value leaves both untouched.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+}
+
+  @Test
+  public void testMaxTotalChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+    // Case 1: no limit set, so the single field value comes back whole.
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+    // Case 2: a 6-char total limit truncates the concatenation to "Apache".
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+    // Add a second input field for the multi-field cases below.
+    doc.addField("bar_s", valueF2);
+    // Case 3: two fields and no limit: both values, joined by a single space.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+    // Case 4: the 6-char total is used up by foo_s, so nothing of bar_s remains.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+    // Case 5: a total limit larger than the combined input leaves both values untouched.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxTotalChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+  }
+
+
+  @Test
+  public void testMaxFieldValueCharsAndMaxTotalChars() throws Exception {
+    SolrInputDocument doc = new SolrInputDocument();
+    String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+    String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+    doc.addField("foo_s", valueF1);
+    // Case 1: no limit set, so the single field value comes back whole.
+    ModifiableSolrParams parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    TikaLanguageIdentifierUpdateProcessor p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1, p.concatFields(doc).trim());
+    // Case 2: per-value limit 8 exceeds total limit 6; only the 6-char "Apache" remains.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "8");
+    parameters.add("langid.maxTotalChars", "6");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apache", p.concatFields(doc).trim());
+    // Add a second input field for the multi-field cases below.
+    doc.addField("bar_s", valueF2);
+    // Case 3: two fields and no limit: both values, joined by a single space.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+    // Case 4: each value is cut to 3 chars ("Apa", "An "); the trimmed result is "Apa An".
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "3");
+    parameters.add("langid.maxTotalChars", "8");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals("Apa An", p.concatFields(doc).trim());
+    // Case 5: both limits larger than the input leave both values untouched.
+    parameters = new ModifiableSolrParams();
+    parameters.add("langid.fl", "foo_s,bar_s");
+    parameters.add("langid.langField", "language");
+    parameters.add("langid.enforceSchema", "false");
+    parameters.add("langid.maxFieldValueChars", "10000");
+    parameters.add("langid.maxTotalChars", "100000");
+    p = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+    assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
+
+  }
+
 }