You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2014/12/08 16:41:03 UTC
svn commit: r1643845 [2/2] - in /lucene/dev/branches/lucene2878: ./ lucene/
lucene/analysis/ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/
lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/ lucene/core/
lucene/core/src/java/org...
Modified: lucene/dev/branches/lucene2878/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterWithThreads.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterWithThreads.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterWithThreads.java (original)
+++ lucene/dev/branches/lucene2878/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterWithThreads.java Mon Dec 8 15:41:02 2014
@@ -110,6 +110,9 @@ public class TestIndexWriterWithThreads
}
break;
}
+ } catch (AlreadyClosedException ace) {
+ // OK: abort closes the writer
+ break;
} catch (Throwable t) {
//t.printStackTrace(System.out);
if (noErrors) {
@@ -166,6 +169,9 @@ public class TestIndexWriterWithThreads
dir.setMaxSizeInBytes(0);
try {
writer.commit();
+ } catch (AlreadyClosedException ace) {
+ // OK: abort closes the writer
+ assertTrue(writer.deleter.isClosed());
} finally {
writer.close();
}
@@ -300,6 +306,9 @@ public class TestIndexWriterWithThreads
writer.commit();
writer.close();
success = true;
+ } catch (AlreadyClosedException ace) {
+ // OK: abort closes the writer
+ assertTrue(writer.deleter.isClosed());
} catch (IOException ioe) {
writer.rollback();
failure.clearDoFail();
@@ -329,10 +338,22 @@ public class TestIndexWriterWithThreads
public void _testSingleThreadFailure(MockDirectoryWrapper.Failure failure) throws IOException {
MockDirectoryWrapper dir = newMockDirectory();
- IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))
- .setMaxBufferedDocs(2)
- .setMergeScheduler(new ConcurrentMergeScheduler())
- .setCommitOnClose(false));
+ IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()))
+ .setMaxBufferedDocs(2)
+ .setMergeScheduler(new ConcurrentMergeScheduler())
+ .setCommitOnClose(false);
+
+ if (iwc.getMergeScheduler() instanceof ConcurrentMergeScheduler) {
+ iwc.setMergeScheduler(new SuppressingConcurrentMergeScheduler() {
+ @Override
+ protected boolean isOK(Throwable th) {
+ return th instanceof AlreadyClosedException ||
+ (th instanceof IllegalStateException && th.getMessage().contains("this writer hit an unrecoverable error"));
+ }
+ });
+ }
+
+ IndexWriter writer = new IndexWriter(dir, iwc);
final Document doc = new Document();
FieldType customType = new FieldType(TextField.TYPE_STORED);
customType.setStoreTermVectors(true);
@@ -353,11 +374,13 @@ public class TestIndexWriterWithThreads
} catch (IOException ioe) {
}
failure.clearDoFail();
- writer.addDocument(doc);
try {
+ writer.addDocument(doc);
writer.commit();
- } finally {
writer.close();
+ } catch (AlreadyClosedException ace) {
+ // OK: abort closes the writer
+ assertTrue(writer.deleter.isClosed());
}
dir.close();
}
Modified: lucene/dev/branches/lucene2878/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java (original)
+++ lucene/dev/branches/lucene2878/lucene/sandbox/src/test/org/apache/lucene/codecs/idversion/TestIDVersionPostingsFormat.java Mon Dec 8 15:41:02 2014
@@ -52,6 +52,7 @@ import org.apache.lucene.search.IndexSea
import org.apache.lucene.search.LiveFieldValues;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
+import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -597,8 +598,21 @@ public class TestIDVersionPostingsFormat
} catch (IllegalArgumentException iae) {
// expected
}
+ try {
+ w.addDocument(doc);
+ fail("should have hit exc");
+ } catch (AlreadyClosedException ace) {
+ // expected
+ }
+ dir.close();
+ }
- doc = new Document();
+ public void testInvalidVersions2() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
+ iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+ Document doc = new Document();
// Long.MAX_VALUE:
doc.add(new StringAndPayloadField("id", "id", new BytesRef(new byte[] {(byte)0x7f, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff, (byte)0xff})));
try {
@@ -608,7 +622,12 @@ public class TestIDVersionPostingsFormat
} catch (IllegalArgumentException iae) {
// expected
}
- w.close();
+ try {
+ w.addDocument(doc);
+ fail("should have hit exc");
+ } catch (AlreadyClosedException ace) {
+ // expected
+ }
dir.close();
}
Modified: lucene/dev/branches/lucene2878/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/NumberRangePrefixTree.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/NumberRangePrefixTree.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/NumberRangePrefixTree.java (original)
+++ lucene/dev/branches/lucene2878/lucene/spatial/src/java/org/apache/lucene/spatial/prefix/tree/NumberRangePrefixTree.java Mon Dec 8 15:41:02 2014
@@ -232,20 +232,34 @@ public abstract class NumberRangePrefixT
public Shape toRangeShape(Shape start, Shape end) {
if (!(start instanceof LevelledValue && end instanceof LevelledValue))
throw new IllegalArgumentException("Must pass "+LevelledValue.class+" but got "+start.getClass());
- LevelledValue minLV = (LevelledValue) start;
- LevelledValue maxLV = (LevelledValue) end;
- if (minLV.equals(maxLV))
- return minLV;
- //Optimize precision of the range, e.g. April 1st to April 30th is April.
- minLV = minLV.getLVAtLevel(truncateStartVals(minLV, 0));
- maxLV = maxLV.getLVAtLevel(truncateEndVals(maxLV, 0));
- int cmp = comparePrefixLV(minLV, maxLV);
+ LevelledValue startLV = (LevelledValue) start;
+ LevelledValue endLV = (LevelledValue) end;
+ //note: this normalization/optimization process is actually REQUIRED based on assumptions elsewhere.
+ //Normalize start & end
+ startLV = startLV.getLVAtLevel(truncateStartVals(startLV, 0)); // chops off trailing min-vals (zeroes)
+ endLV = endLV.getLVAtLevel(truncateEndVals(endLV, 0)); // chops off trailing max-vals
+ //Optimize to just start or end if it's equivalent, e.g. April to April 1st is April 1st.
+ int cmp = comparePrefixLV(startLV, endLV);
if (cmp > 0) {
throw new IllegalArgumentException("Wrong order: "+start+" TO "+end);
}
- if (cmp == 0 && minLV.getLevel() == maxLV.getLevel())
- return minLV;
- return new NRShape(minLV, maxLV);
+ if (cmp == 0) {//one is a prefix of the other
+ if (startLV.getLevel() == endLV.getLevel()) {
+ //same
+ return startLV;
+ } else if (endLV.getLevel() > startLV.getLevel()) {
+ // e.g. April to April 1st
+ if (truncateStartVals(endLV, startLV.getLevel()) == startLV.getLevel()) {
+ return endLV;
+ }
+ } else {//minLV level > maxLV level
+ // e.g. April 30 to April
+ if (truncateEndVals(startLV, endLV.getLevel()) == endLV.getLevel()) {
+ return startLV;
+ }
+ }
+ }
+ return new NRShape(startLV, endLV);
}
/** From lv.getLevel on up, it returns the first Level seen with val != 0. It doesn't check past endLevel. */
Modified: lucene/dev/branches/lucene2878/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/DateNRStrategyTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/DateNRStrategyTest.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/DateNRStrategyTest.java (original)
+++ lucene/dev/branches/lucene2878/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/DateNRStrategyTest.java Mon Dec 8 15:41:02 2014
@@ -27,6 +27,7 @@ import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
+import java.text.ParseException;
import java.util.Calendar;
public class DateNRStrategyTest extends RandomSpatialOpStrategyTestCase {
@@ -65,12 +66,6 @@ public class DateNRStrategyTest extends
testOperationRandomShapes(SpatialOperation.Contains);
}
- @Test @Ignore("see LUCENE-5692")
- @Repeat(iterations = ITERATIONS)
- public void testDisjoint() throws IOException {
- testOperationRandomShapes(SpatialOperation.IsDisjointTo);
- }
-
@Test
public void testWithinSame() throws IOException {
final Calendar cal = tree.newCal();
Modified: lucene/dev/branches/lucene2878/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/tree/DateRangePrefixTreeTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/tree/DateRangePrefixTreeTest.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/tree/DateRangePrefixTreeTest.java (original)
+++ lucene/dev/branches/lucene2878/lucene/spatial/src/test/org/apache/lucene/spatial/prefix/tree/DateRangePrefixTreeTest.java Mon Dec 8 15:41:02 2014
@@ -157,7 +157,9 @@ public class DateRangePrefixTreeTest ext
assertEquals("2014", tree.parseShape("[2014-01-01 TO 2014-12-31]").toString());
- assertEquals("2014", tree.parseShape("[2014-01 TO 2014]").toString());
+ assertEquals("2014", tree.parseShape("[2014-01 TO 2014]").toString());
+ assertEquals("2014-01", tree.parseShape("[2014 TO 2014-01]").toString());
+ assertEquals("2014-12", tree.parseShape("[2014-12 TO 2014]").toString());
assertEquals("[2014 TO 2014-04-06]", tree.parseShape("[2014-01 TO 2014-04-06]").toString());
Modified: lucene/dev/branches/lucene2878/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java (original)
+++ lucene/dev/branches/lucene2878/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java Mon Dec 8 15:41:02 2014
@@ -36,6 +36,9 @@ import org.apache.lucene.codecs.assertin
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
import org.apache.lucene.codecs.compressing.CompressingCodec;
+import org.apache.lucene.codecs.lucene50.Lucene50Codec;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50StoredFieldsFormat.Mode;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
@@ -44,7 +47,9 @@ import org.apache.lucene.search.similari
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.junit.internal.AssumptionViolatedException;
+
import com.carrotsearch.randomizedtesting.RandomizedContext;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import static org.apache.lucene.util.LuceneTestCase.INFOSTREAM;
import static org.apache.lucene.util.LuceneTestCase.LiveIWCFlushMode;
@@ -198,6 +203,8 @@ final class TestRuleSetupAndRestoreClass
codec = new AssertingCodec();
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
+ } else if ("Lucene50".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene50"))) {
+ codec = new Lucene50Codec(RandomPicks.randomFrom(random, Lucene50StoredFieldsFormat.Mode.values()));
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);
} else if ("random".equals(TEST_POSTINGSFORMAT)) {
Modified: lucene/dev/branches/lucene2878/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/CHANGES.txt?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/CHANGES.txt (original)
+++ lucene/dev/branches/lucene2878/solr/CHANGES.txt Mon Dec 8 15:41:02 2014
@@ -110,6 +110,12 @@ Upgrading from Solr 4.x
* SolrCore.reload(ConfigSet coreConfig, SolrCore prev) was deprecated in 4.10.3 and
removed in 5.0. use SolrCore.reload(ConfigSet coreConfig). See SOLR-5864.
+* The "termIndexInterval" option in solrconfig.xml has been a No-Op in the default codec
+ since Solr 4.0, and has been removed completely in 5.0. If you get an "Illegal parameter
+ 'termIndexInterval'" error when upgrading, you can safely remove this option from your
+ configs. If you have a strong need to configure this, you must explicitly configure your
+ schema with a custom codec. See SOLR-6560 for more details.
+
Detailed Change List
----------------------
@@ -296,6 +302,13 @@ Bug Fixes
* SOLR-6763: Shard leader elections should not persist across session expiry
(Alan Woodward, Mark Miller)
+
+* SOLR-3881: Avoid OOMs in LanguageIdentifierUpdateProcessor:
+ - Added langid.maxFieldValueChars and langid.maxTotalChars params to limit
+ input, by default 10k and 20k chars, respectively.
+ - Moved input concatenation to Tika implementation; the langdetect
+ implementation instead appends each input piece via the langdetect API.
+ (Vitaliy Zhovtyuk, Tomás Fernández Löbbe, Rob Tulloh, Steve Rowe)
Optimizations
----------------------
@@ -448,6 +461,9 @@ Other Changes
* SOLR-6752: Buffer Cache allocate/lost metrics should be exposed.
(Mike Drob via Mark Miller)
+* SOLR-6560: Purge termIndexInterval from example/test configs
+ (Tom Burton-West, hossman)
+
================== 4.10.3 ==================
Bug Fixes
Modified: lucene/dev/branches/lucene2878/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/EntityProcessorBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/EntityProcessorBase.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/EntityProcessorBase.java (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/EntityProcessorBase.java Mon Dec 8 15:41:02 2014
@@ -49,7 +49,7 @@ public class EntityProcessorBase extends
protected DIHCacheSupport cacheSupport = null;
- protected Zipper zipper;
+ private Zipper zipper;
@Override
Modified: lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java Mon Dec 8 15:41:02 2014
@@ -18,6 +18,7 @@ package org.apache.solr.update.processor
*/
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Collections;
import java.util.List;
@@ -28,6 +29,7 @@ import com.cybozu.labs.langdetect.Detect
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
+import org.apache.solr.common.SolrInputDocument;
/**
* Identifies the language of a set of input fields using http://code.google.com/p/language-detection
@@ -43,15 +45,32 @@ public class LangDetectLanguageIdentifie
}
@Override
- protected List<DetectedLanguage> detectLanguage(String content) {
- if (content.trim().length() == 0) { // to be consistent with the tika impl?
- log.debug("No input text to detect language from, returning empty list");
- return Collections.emptyList();
- }
-
+ protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
try {
Detector detector = DetectorFactory.create();
- detector.append(content);
+ detector.setMaxTextLength(maxTotalChars);
+
+ for (String fieldName : inputFields) {
+ log.debug("Appending field " + fieldName);
+ if (doc.containsKey(fieldName)) {
+ Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+ if (fieldValues != null) {
+ for (Object content : fieldValues) {
+ if (content instanceof String) {
+ String stringContent = (String) content;
+ if (stringContent.length() > maxFieldValueChars) {
+ detector.append(stringContent.substring(0, maxFieldValueChars));
+ } else {
+ detector.append(stringContent);
+ }
+ detector.append(" ");
+ } else {
+ log.warn("Field " + fieldName + " not a String value, not including in detection");
+ }
+ }
+ }
+ }
+ }
ArrayList<Language> langlist = detector.getProbabilities();
ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
for (Language l: langlist) {
Modified: lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java Mon Dec 8 15:41:02 2014
@@ -41,12 +41,16 @@ public interface LangIdParams {
String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap"; // Enables mapping multiple langs to same output field
String MAP_PATTERN = LANGUAGE_ID + ".map.pattern"; // RegEx pattern to match field name
String MAP_REPLACE = LANGUAGE_ID + ".map.replace"; // Replace pattern
+ String MAX_FIELD_VALUE_CHARS = LANGUAGE_ID + ".maxFieldValueChars"; // Maximum number of characters to use per field for language detection
+ String MAX_TOTAL_CHARS = LANGUAGE_ID + ".maxTotalChars"; // Maximum number of characters to use per all concatenated fields for language detection
String DOCID_FIELD_DEFAULT = "id";
String DOCID_LANGFIELD_DEFAULT = null;
String DOCID_LANGSFIELD_DEFAULT = null;
String MAP_PATTERN_DEFAULT = "(.*)";
String MAP_REPLACE_DEFAULT = "$1_{lang}";
+ int MAX_FIELD_VALUE_CHARS_DEFAULT = 10000;
+ int MAX_TOTAL_CHARS_DEFAULT = 20000;
// TODO: This default threshold accepts even "uncertain" detections.
// Increase &langid.threshold above 0.5 to return only certain detections
Modified: lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java Mon Dec 8 15:41:02 2014
@@ -78,6 +78,8 @@ public abstract class LanguageIdentifier
protected HashMap<String,String> lcMap;
protected HashMap<String,String> mapLcMap;
protected IndexSchema schema;
+ protected int maxFieldValueChars;
+ protected int maxTotalChars;
// Regex patterns
protected final Pattern tikaSimilarityPattern = Pattern.compile(".*\\((.*?)\\)");
@@ -169,8 +171,21 @@ public abstract class LanguageIdentifier
mapPattern = Pattern.compile(params.get(MAP_PATTERN, MAP_PATTERN_DEFAULT));
mapReplaceStr = params.get(MAP_REPLACE, MAP_REPLACE_DEFAULT);
-
-
+ maxFieldValueChars = params.getInt(MAX_FIELD_VALUE_CHARS, MAX_FIELD_VALUE_CHARS_DEFAULT);
+ maxTotalChars = params.getInt(MAX_TOTAL_CHARS, MAX_TOTAL_CHARS_DEFAULT);
+ // Reconcile the two limits: a per-field-value limit larger than the total limit makes no sense.
+ if (maxFieldValueChars > maxTotalChars) {
+   if (maxTotalChars == MAX_TOTAL_CHARS_DEFAULT) {
+     // maxTotalChars still has its default value, so treat maxFieldValueChars as the
+     // user's intent and raise maxTotalChars to match it.
+     // NOTE(review): an explicit maxTotalChars equal to the default is indistinguishable here.
+     log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is greater than " + MAX_TOTAL_CHARS + " ("
+         + maxTotalChars + "). Setting " + MAX_TOTAL_CHARS + " to " + maxFieldValueChars + ".");
+     maxTotalChars = maxFieldValueChars;
+   } else {
+     // maxTotalChars was changed from its default, so it wins: lower maxFieldValueChars to match it.
+     log.warn(MAX_FIELD_VALUE_CHARS + " (" + maxFieldValueChars + ") is greater than " + MAX_TOTAL_CHARS + " ("
+         + maxTotalChars + "). Setting " + MAX_FIELD_VALUE_CHARS + " to " + maxTotalChars + ".");
+     maxFieldValueChars = maxTotalChars;
+   }
+ }
}
log.debug("LangId configured");
@@ -203,11 +218,10 @@ public abstract class LanguageIdentifier
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
if(langField == null || !doc.containsKey(langField) || (doc.containsKey(langField) && overwrite)) {
- String allText = concatFields(doc, inputFields);
- List<DetectedLanguage> languagelist = detectLanguage(allText);
+ List<DetectedLanguage> languagelist = detectLanguage(doc);
docLang = resolveLanguage(languagelist, fallbackLang);
docLangs.add(docLang);
- log.debug("Detected main document language from fields "+inputFields+": "+docLang);
+ log.debug("Detected main document language from fields "+ Arrays.toString(inputFields) +": "+docLang);
if(doc.containsKey(langField) && overwrite) {
log.debug("Overwritten old value "+doc.getFieldValue(langField));
@@ -227,8 +241,7 @@ public abstract class LanguageIdentifier
if(doc.containsKey(fieldName)) {
String fieldLang;
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
- String text = (String) doc.getFieldValue(fieldName);
- List<DetectedLanguage> languagelist = detectLanguage(text);
+ List<DetectedLanguage> languagelist = detectLanguage(doc);
fieldLang = resolveLanguage(languagelist, docLang);
docLangs.add(fieldLang);
log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
@@ -284,37 +297,13 @@ public abstract class LanguageIdentifier
return lang;
}
- /*
- * Concatenates content from multiple fields
- */
- protected String concatFields(SolrInputDocument doc, String[] fields) {
- StringBuilder sb = new StringBuilder();
- for (String fieldName : inputFields) {
- log.debug("Appending field "+fieldName);
- if (doc.containsKey(fieldName)) {
- Collection<Object> fieldValues = doc.getFieldValues(fieldName);
- if (fieldValues != null) {
- for (Object content : fieldValues) {
- if (content instanceof String) {
- sb.append((String) content);
- sb.append(" ");
- } else {
- log.warn("Field " + fieldName + " not a String value, not including in detection");
- }
- }
- }
- }
- }
- return sb.toString();
- }
-
/**
 * Detects language(s) from the input fields of a document.
 * Classes wishing to implement their own language detection module should override this method.
 * @param content The document whose input fields are examined to identify the language
* @return List of detected language(s) according to RFC-3066
*/
- protected abstract List<DetectedLanguage> detectLanguage(String content);
+ protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
/**
* Chooses a language based on the list of candidates detected
Modified: lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java Mon Dec 8 15:41:02 2014
@@ -24,6 +24,9 @@ import org.apache.solr.request.SolrQuery
import org.apache.solr.response.SolrQueryResponse;
import org.apache.tika.language.LanguageIdentifier;
+import org.apache.solr.common.SolrInputDocument;
+import java.util.Collection;
+
/**
* Identifies the language of a set of input fields using Tika's
* LanguageIdentifier.
@@ -40,9 +43,10 @@ public class TikaLanguageIdentifierUpdat
}
@Override
- protected List<DetectedLanguage> detectLanguage(String content) {
+ protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
List<DetectedLanguage> languages = new ArrayList<>();
- if(content.trim().length() != 0) {
+ String content = concatFields(doc);
+ if (content.length() != 0) {
LanguageIdentifier identifier = new LanguageIdentifier(content);
// FIXME: Hack - we get the distance from toString and calculate our own certainty score
Double distance = Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
@@ -57,4 +61,59 @@ public class TikaLanguageIdentifierUpdat
}
return languages;
}
+
+
+ /**
+ * Concatenates content from multiple fields
+ */
+ protected String concatFields(SolrInputDocument doc) {
+ StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
+ for (String fieldName : inputFields) {
+ log.debug("Appending field " + fieldName);
+ if (doc.containsKey(fieldName)) {
+ Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+ if (fieldValues != null) {
+ for (Object content : fieldValues) {
+ if (content instanceof String) {
+ String stringContent = (String) content;
+ if (stringContent.length() > maxFieldValueChars) {
+ sb.append(stringContent.substring(0, maxFieldValueChars));
+ } else {
+ sb.append(stringContent);
+}
+ sb.append(" ");
+ if (sb.length() > maxTotalChars) {
+ sb.setLength(maxTotalChars);
+ break;
+ }
+ } else {
+ log.warn("Field " + fieldName + " not a String value, not including in detection");
+ }
+ }
+ }
+ }
+ }
+ return sb.toString();
+ }
+
+ /**
+  * Calculates the expected size of the concatenated field content, used to
+  * pre-size the buffer built by {@link #concatFields(SolrInputDocument)}.
+  * Each String value contributes at most maxFieldValueChars characters and
+  * the running total is capped at maxTotalChars. Non-String values are
+  * ignored, as are fields that are absent from the document.
+  *
+  * @param doc solr input document
+  * @param fields fields to select
+  * @return expected size of string value
+  */
+ private int getExpectedSize(SolrInputDocument doc, String[] fields) {
+   int docSize = 0;
+   for (String field : fields) {
+     Collection<Object> contents = doc.getFieldValues(field);
+     if (contents == null) {
+       // The document has no values for this field; skip it instead of failing with an NPE.
+       continue;
+     }
+     for (Object content : contents) {
+       if (content instanceof String) {
+         docSize += Math.min(((String) content).length(), maxFieldValueChars);
+       }
+     }
+     docSize = Math.min(docSize, maxTotalChars);
+   }
+   return docSize;
+ }
}
Modified: lucene/dev/branches/lucene2878/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java Mon Dec 8 15:41:02 2014
@@ -17,11 +17,166 @@ package org.apache.solr.update.processor
* limitations under the License.
*/
+import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
+import org.junit.Test;
public class TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageIdentifierUpdateProcessorFactoryTestCase {
/**
 * Creates the Tika-based language identifier processor used by the tests in this class.
 * The request passed to the processor is built by {@code _parser} from the core
 * returned by {@code h.getCore()} and the supplied parameters.
 *
 * @param parameters parameters used to build the request for the processor
 * @return a new {@link TikaLanguageIdentifierUpdateProcessor}
 */
@Override
protected LanguageIdentifierUpdateProcessor createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
return new TikaLanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(h.getCore(), parameters, null), resp, null);
}
+
+
+ @Test
+ public void testMaxFieldValueChars() throws Exception {
+   // Sample texts; the second one is attached to the document further down.
+   String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+   String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+   SolrInputDocument document = new SolrInputDocument();
+   document.addField("foo_s", valueF1);
+
+   // Single field, no limit configured: the whole value is returned.
+   ModifiableSolrParams params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   TikaLanguageIdentifierUpdateProcessor processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   String actual = processor.concatFields(document).trim();
+   assertEquals(valueF1, actual);
+
+   // Single field, maxFieldValueChars=6: only the first six characters remain.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxFieldValueChars", "6");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals("Apache", actual);
+
+   document.addField("bar_s", valueF2);
+
+   // Two fields, no limit configured: both values, separated by a space.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals(valueF1 + " " + valueF2, actual);
+
+   // Two fields, maxFieldValueChars=6: each value is cut to six characters.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxFieldValueChars", "6");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals("Apache" + " " + "An ope", actual);
+
+   // Two fields, limit far above the value lengths: nothing is cut.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxFieldValueChars", "100000");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals(valueF1 + " " + valueF2, actual);
+
+ }
+
+ @Test
+ public void testMaxTotalChars() throws Exception {
+   // Sample texts; the second one is attached to the document further down.
+   String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+   String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+   SolrInputDocument document = new SolrInputDocument();
+   document.addField("foo_s", valueF1);
+
+   // Single field, no limit configured: the whole value is returned.
+   ModifiableSolrParams params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   TikaLanguageIdentifierUpdateProcessor processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   String actual = processor.concatFields(document).trim();
+   assertEquals(valueF1, actual);
+
+   // Single field, maxTotalChars=6: only the first six characters remain.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxTotalChars", "6");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals("Apache", actual);
+
+   document.addField("bar_s", valueF2);
+
+   // Two fields, no limit configured: both values, separated by a space.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals(valueF1 + " " + valueF2, actual);
+
+   // Two fields, maxTotalChars=6: the overall result is cut to six characters.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxTotalChars", "6");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals("Apache", actual);
+
+   // Two fields, limit far above the combined length: nothing is cut.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxTotalChars", "100000");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals(valueF1 + " " + valueF2, actual);
+
+ }
+
+
+ @Test
+ public void testMaxFieldValueCharsAndMaxTotalChars() throws Exception {
+   // Sample texts; the second one is attached to the document further down.
+   String valueF1 = "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.";
+   String valueF2 = "An open-source search server based on the Lucene Java search library. News, documentation, resources, and download.";
+   SolrInputDocument document = new SolrInputDocument();
+   document.addField("foo_s", valueF1);
+
+   // Single field, no limits configured: the whole value is returned.
+   ModifiableSolrParams params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   TikaLanguageIdentifierUpdateProcessor processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   String actual = processor.concatFields(document).trim();
+   assertEquals(valueF1, actual);
+
+   // Single field, per-value limit 8 and total limit 6: the smaller total limit wins.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxFieldValueChars", "8");
+   params.add("langid.maxTotalChars", "6");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals("Apache", actual);
+
+   document.addField("bar_s", valueF2);
+
+   // Two fields, no limits configured: both values, separated by a space.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals(valueF1 + " " + valueF2, actual);
+
+   // Two fields, per-value limit 3 and total limit 8: three characters of each value.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxFieldValueChars", "3");
+   params.add("langid.maxTotalChars", "8");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals("Apa An", actual);
+
+   // Two fields, both limits far above the value lengths: nothing is cut.
+   params = new ModifiableSolrParams();
+   params.add("langid.fl", "foo_s,bar_s");
+   params.add("langid.langField", "language");
+   params.add("langid.enforceSchema", "false");
+   params.add("langid.maxFieldValueChars", "10000");
+   params.add("langid.maxTotalChars", "100000");
+   processor = (TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(params);
+   actual = processor.concatFields(document).trim();
+   assertEquals(valueF1 + " " + valueF2, actual);
+
+ }
+
}
Modified: lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/collection1/conf/solrconfig.xml?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/collection1/conf/solrconfig.xml Mon Dec 8 15:41:02 2014
@@ -233,11 +233,6 @@
<unlockOnStartup>false</unlockOnStartup>
-->
- <!-- Expert: Controls how often Lucene loads terms into memory
- Default is 128 and is likely good for most everyone.
- -->
- <!-- <termIndexInterval>128</termIndexInterval> -->
-
<!-- If true, IndexReaders will be reopened (often more efficient)
instead of closed and then opened. Default: true
-->
Modified: lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/minimr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/minimr/conf/solrconfig.xml?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/minimr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/minimr/conf/solrconfig.xml Mon Dec 8 15:41:02 2014
@@ -248,11 +248,6 @@
<!--
<unlockOnStartup>false</unlockOnStartup>
-->
-
- <!-- Expert: Controls how often Lucene loads terms into memory
- Default is 128 and is likely good for most everyone.
- -->
- <!-- <termIndexInterval>128</termIndexInterval> -->
<!-- If true, IndexReaders will be reopened (often more efficient)
instead of closed and then opened. Default: true
Modified: lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/mrunit/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/mrunit/conf/solrconfig.xml?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/mrunit/conf/solrconfig.xml (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/mrunit/conf/solrconfig.xml Mon Dec 8 15:41:02 2014
@@ -251,11 +251,6 @@
<unlockOnStartup>false</unlockOnStartup>
-->
- <!-- Expert: Controls how often Lucene loads terms into memory
- Default is 128 and is likely good for most everyone.
- -->
- <!-- <termIndexInterval>128</termIndexInterval> -->
-
<!-- If true, IndexReaders will be reopened (often more efficient)
instead of closed and then opened. Default: true
-->
Modified: lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/solrcelltest/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/solrcelltest/collection1/conf/solrconfig.xml?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/solrcelltest/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/solrcelltest/collection1/conf/solrconfig.xml Mon Dec 8 15:41:02 2014
@@ -232,11 +232,6 @@
<!--
<unlockOnStartup>false</unlockOnStartup>
-->
-
- <!-- Expert: Controls how often Lucene loads terms into memory
- Default is 128 and is likely good for most everyone.
- -->
- <!-- <termIndexInterval>128</termIndexInterval> -->
<!-- If true, IndexReaders will be reopened (often more efficient)
instead of closed and then opened. Default: true
Modified: lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/solrcloud/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/solrcloud/conf/solrconfig.xml?rev=1643845&r1=1643844&r2=1643845&view=diff
==============================================================================
--- lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/solrcloud/conf/solrconfig.xml (original)
+++ lucene/dev/branches/lucene2878/solr/contrib/morphlines-core/src/test-files/solr/solrcloud/conf/solrconfig.xml Mon Dec 8 15:41:02 2014
@@ -252,11 +252,6 @@
<unlockOnStartup>false</unlockOnStartup>
-->
- <!-- Expert: Controls how often Lucene loads terms into memory
- Default is 128 and is likely good for most everyone.
- -->
- <!-- <termIndexInterval>128</termIndexInterval> -->
-
<!-- If true, IndexReaders will be reopened (often more efficient)
instead of closed and then opened. Default: true
-->