You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2019/01/03 17:29:52 UTC
lucene-solr:master: SOLR-11774: langid.map.individual now works
together with langid.map.keepOrig
Repository: lucene-solr
Updated Branches:
refs/heads/master 6342ec699 -> 00f8f3a13
SOLR-11774: langid.map.individual now works together with langid.map.keepOrig
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/00f8f3a1
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/00f8f3a1
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/00f8f3a1
Branch: refs/heads/master
Commit: 00f8f3a13acd3c4da491e7169afdfbdc0f38e26d
Parents: 6342ec6
Author: Jan Høydahl <ja...@apache.org>
Authored: Thu Jan 3 18:26:27 2019 +0100
Committer: Jan Høydahl <ja...@apache.org>
Committed: Thu Jan 3 18:27:24 2019 +0100
----------------------------------------------------------------------
solr/CHANGES.txt | 6 +
...DetectLanguageIdentifierUpdateProcessor.java | 42 ++--
.../LanguageIdentifierUpdateProcessor.java | 105 +++------
.../OpenNLPLangDetectUpdateProcessor.java | 6 +-
.../processor/SolrInputDocumentReader.java | 224 +++++++++++++++++++
.../TikaLanguageIdentifierUpdateProcessor.java | 9 +-
...dentifierUpdateProcessorFactoryTestCase.java | 109 +++++++--
.../processor/SolrInputDocumentReaderTest.java | 108 +++++++++
8 files changed, 486 insertions(+), 123 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/00f8f3a1/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 00c0081..8d081ae 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -76,6 +76,9 @@ Upgrade Notes
This choice used to be toggleable with an internal/expert "anonChildDocs" parameter flag which is now gone.
(David Smiley)
+* SOLR-11774: In 'langid' contrib, the LanguageIdentifierUpdateProcessor base class changed some method signatures.
+ If you have a custom language identifier implementation you will need to adapt your code.
+
New Features
----------------------
@@ -100,6 +103,9 @@ Bug Fixes
* SOLR-13058: Fix block that was synchronizing on the wrong collection in OverseerTaskProcessor (Gus Heck)
+* SOLR-11774: langid.map.individual now works together with langid.map.keepOrig. Also, the detectLanguage() API
+ has changed to accept a Reader, allowing for more memory-efficient implementations (janhoy)
+
Improvements
----------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/00f8f3a1/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
index 8af05b3..8b4161a 100644
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
@@ -16,9 +16,10 @@
*/
package org.apache.solr.update.processor;
+import java.io.IOException;
+import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
-import java.util.Collection;
import java.util.Collections;
import java.util.List;
@@ -29,7 +30,6 @@ import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.cybozu.labs.langdetect.Language;
-import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -48,33 +48,26 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
super(req, rsp, next);
}
+ /**
+ * Detects language(s) from a reader, typically based on some fields in SolrInputDocument
+ * Classes wishing to implement their own language detection module should override this method.
+ *
+ * @param solrDocReader A reader serving the text from the document to detect
+ * @return List of detected language(s) according to RFC-3066
+ */
@Override
- protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+ protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
try {
Detector detector = DetectorFactory.create();
detector.setMaxTextLength(maxTotalChars);
- for (String fieldName : inputFields) {
- log.debug("Appending field " + fieldName);
- if (doc.containsKey(fieldName)) {
- Collection<Object> fieldValues = doc.getFieldValues(fieldName);
- if (fieldValues != null) {
- for (Object content : fieldValues) {
- if (content instanceof String) {
- String stringContent = (String) content;
- if (stringContent.length() > maxFieldValueChars) {
- detector.append(stringContent.substring(0, maxFieldValueChars));
- } else {
- detector.append(stringContent);
- }
- detector.append(" ");
- } else {
- log.warn("Field " + fieldName + " not a String value, not including in detection");
- }
- }
- }
- }
+ // TODO Work around bug in LangDetect 1.1 which does not expect a -1 return value at end of stream,
+ // but instead only looks at ready()
+ if (solrDocReader instanceof SolrInputDocumentReader) {
+ ((SolrInputDocumentReader)solrDocReader).setEodReturnValue(0);
}
+ detector.append(solrDocReader);
+
ArrayList<Language> langlist = detector.getProbabilities();
ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
for (Language l: langlist) {
@@ -84,6 +77,9 @@ public class LangDetectLanguageIdentifierUpdateProcessor extends LanguageIdentif
} catch (LangDetectException e) {
log.debug("Could not determine language, returning empty list: ", e);
return Collections.emptyList();
+ } catch (IOException e) {
+ log.warn("Could not determine language.", e);
+ return Collections.emptyList();
}
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/00f8f3a1/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
index 3679905..8528a7e 100644
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@@ -30,10 +30,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
+import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -41,11 +41,11 @@ import java.util.regex.Pattern;
/**
- * Identifies the language of a set of input fields.
- * Also supports mapping of field names based
- * on detected language.
* <p>
- * See <a href="http://wiki.apache.org/solr/LanguageDetection">http://wiki.apache.org/solr/LanguageDetection</a>
+ * Identifies the language of a set of input fields.
+ * Also supports mapping of field names based on detected language.
+ * </p>
+ * See <a href="https://lucene.apache.org/solr/guide/7_4/detecting-languages-during-indexing.html">Detecting Languages During Indexing</a> in the reference guide
* @since 3.5
* @lucene.experimental
*/
@@ -207,11 +207,10 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
}
/**
- * This is the main, testable process method called from processAdd()
- * @param doc the SolrInputDocument to work on
- * @return the modified SolrInputDocument
+ * This is the main process method called from processAdd()
+ * @param doc the SolrInputDocument to modify
*/
- protected SolrInputDocument process(SolrInputDocument doc) {
+ protected void process(SolrInputDocument doc) {
String docLang = null;
HashSet<String> docLangs = new HashSet<>();
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
@@ -240,7 +239,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
if(doc.containsKey(fieldName)) {
String fieldLang;
if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
- List<DetectedLanguage> languagelist = detectLanguage(doc);
+ List<DetectedLanguage> languagelist = detectLanguage(solrDocReader(doc, new String[]{fieldName}));
fieldLang = resolveLanguage(languagelist, docLang);
docLangs.add(fieldLang);
log.debug("Mapping field "+fieldName+" using individually detected language "+fieldLang);
@@ -270,8 +269,6 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
if(langsField != null && langsField.length() != 0) {
doc.setField(langsField, docLangs.toArray());
}
-
- return doc;
}
/**
@@ -297,12 +294,21 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
}
/**
- * Detects language(s) from a string.
+ * Detects language(s) from all configured fields
+ * @param doc The solr document
+ * @return List of detected language(s) according to RFC-3066
+ */
+ protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+ return detectLanguage(solrDocReader(doc, inputFields));
+ }
+
+ /**
+ * Detects language(s) from a reader, typically based on some fields in SolrInputDocument
* Classes wishing to implement their own language detection module should override this method.
- * @param content The content to identify
+ * @param solrDocReader A reader serving the text from the document to detect
* @return List of detected language(s) according to RFC-3066
*/
- protected abstract List<DetectedLanguage> detectLanguage(SolrInputDocument content);
+ protected abstract List<DetectedLanguage> detectLanguage(Reader solrDocReader);
/**
* Chooses a language based on the list of candidates detected
@@ -400,67 +406,22 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
this.enabled = enabled;
}
-
-
/**
- * Concatenates content from multiple fields
+ * Returns a reader that streams String content from fields.
+ * This is more memory efficient than building a full string buffer
+ * @param doc the solr document
+ * @param fields the field names to read
+ * @return a reader over the fields
*/
- protected String concatFields(SolrInputDocument doc) {
- StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
- for (String fieldName : inputFields) {
- log.debug("Appending field " + fieldName);
- if (doc.containsKey(fieldName)) {
- Collection<Object> fieldValues = doc.getFieldValues(fieldName);
- if (fieldValues != null) {
- for (Object content : fieldValues) {
- if (content instanceof String) {
- String stringContent = (String) content;
- if (stringContent.length() > maxFieldValueChars) {
- sb.append(stringContent.substring(0, maxFieldValueChars));
- } else {
- sb.append(stringContent);
- }
- sb.append(" ");
- if (sb.length() > maxTotalChars) {
- sb.setLength(maxTotalChars);
- break;
- }
- } else {
- log.warn("Field " + fieldName + " not a String value, not including in detection");
- }
- }
- }
- }
- }
- return sb.toString();
+ protected SolrInputDocumentReader solrDocReader(SolrInputDocument doc, String[] fields) {
+ return new SolrInputDocumentReader(doc, fields, maxTotalChars, maxFieldValueChars, " ");
}
-
+
/**
- * Calculate expected string size.
- *
- * @param doc solr input document
- * @param fields fields to select
- * @return expected size of string value
+ * Concatenates content from input fields defined in langid.fl.
+ * For test purposes only
*/
- private int getExpectedSize(SolrInputDocument doc, String[] fields) {
- int docSize = 0;
- for (String field : fields) {
- if (doc.containsKey(field)) {
- Collection<Object> contents = doc.getFieldValues(field);
- if (contents != null) {
- for (Object content : contents) {
- if (content instanceof String) {
- docSize += Math.min(((String) content).length(), maxFieldValueChars);
- }
- }
-
- if (docSize > maxTotalChars) {
- docSize = maxTotalChars;
- break;
- }
- }
- }
- }
- return docSize;
+ protected String concatFields(SolrInputDocument doc) {
+ return SolrInputDocumentReader.asString(solrDocReader(doc, inputFields));
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/00f8f3a1/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
index 83f4fe4..ab17133 100644
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
@@ -16,6 +16,7 @@
*/
package org.apache.solr.update.processor;
+import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
@@ -23,7 +24,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
-import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.Logger;
@@ -54,9 +54,9 @@ public class OpenNLPLangDetectUpdateProcessor extends LanguageIdentifierUpdatePr
}
@Override
- protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+ protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
List<DetectedLanguage> languages = new ArrayList<>();
- String content = concatFields(doc);
+ String content = SolrInputDocumentReader.asString(solrDocReader);
if (content.length() != 0) {
LanguageDetectorME ldme = new LanguageDetectorME(model);
Language[] langs = ldme.predictLanguages(content);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/00f8f3a1/solr/contrib/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
new file mode 100644
index 0000000..ed839de
--- /dev/null
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.lang.invoke.MethodHandles;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reader on top of SolrInputDocument that can "stream" a document as a character stream in a memory
+ * efficient way, to avoid potentially large intermediate string buffers containing whole document content.
+ * @lucene.experimental
+ */
+public class SolrInputDocumentReader extends Reader {
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ private SolrInputDocument doc;
+ private final String[] fields;
+ private final String fieldValueSep;
+ private final int maxTotalChars;
+ private final int maxCharsPerFieldValue;
+ private int totalCharsConsumed; // running total updated by read(); compared against maxTotalChars in fillBuffer()
+
+ // Remember where we are at: index into fields, index of the value within that field, char offset within that value
+ private int currentFieldIdx = 0;
+ private int currentFieldValueIdx = 0;
+ private int currentFieldValueOffset = 0;
+ private boolean eod = false; // set once end of document is reached; ready() then returns false
+ // Normally a Reader will return -1 at end of document, but to work around LangDetect's bug, we allow another value
+ private int eodReturnValue = -1;
+
+ /**
+ * Creates a character-stream reader that streams all fields whose first value is a String, with space as separator
+ *
+ * @param doc Solr input document
+ * @param maxTotalChars max chars to consume total
+ * @param maxCharsPerFieldValue max chars to consume per field value
+ */
+ public SolrInputDocumentReader(SolrInputDocument doc, int maxTotalChars, int maxCharsPerFieldValue) {
+ this(doc, getStringFields(doc), maxTotalChars, maxCharsPerFieldValue, " ");
+ }
+
+ /**
+ * Creates a character-stream reader that reads the listed fields in order, with
+ * max lengths as specified. Throws IllegalArgumentException if fields is null or empty.
+ *
+ * @param doc Solr input document
+ * @param fields list of field names to include
+ * @param maxTotalChars max chars to consume total
+ * @param maxCharsPerFieldValue max chars to consume per field value
+ * @param fieldValueSep separator to insert between field values
+ */
+ public SolrInputDocumentReader(SolrInputDocument doc, String[] fields, int maxTotalChars,
+ int maxCharsPerFieldValue, String fieldValueSep) {
+ this.doc = doc;
+ this.fields = fields;
+ this.fieldValueSep = fieldValueSep;
+ if (fields == null || fields.length == 0) throw new IllegalArgumentException("fields cannot be empty");
+ this.maxTotalChars = maxTotalChars;
+ this.maxCharsPerFieldValue = maxCharsPerFieldValue;
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ StringBuilder sb = new StringBuilder(len); // staging buffer: filled first, then copied into cbuf
+ int numChars = fillBuffer(sb, len);
+
+ if (numChars > -1) { // also taken when eodReturnValue was set to 0; getChars then copies zero chars
+ sb.getChars(0, numChars, cbuf, off);
+ }
+ totalCharsConsumed += numChars; // NOTE(review): at end of document numChars is eodReturnValue (-1 by default), which decrements the counter; confirm this is harmless
+ return numChars;
+ }
+
+ private int fillBuffer(StringBuilder sb, int targetLen) { // returns number of chars placed in sb, or eodReturnValue when none
+ if (eod) return eodReturnValue; // once reached, end of document is sticky
+ if (totalCharsConsumed + targetLen > maxTotalChars) { // clamp the request to what is left of the maxTotalChars budget
+ targetLen = maxTotalChars - totalCharsConsumed;
+ }
+
+ while (sb.length() < targetLen && !eod) {
+ nextDocChunk(sb, targetLen);
+ }
+
+ if (sb.length() == 0) { // nothing produced (fields exhausted or budget used up): flag end of document
+ eod = true;
+ return eodReturnValue;
+ } else {
+ return sb.length();
+ }
+ }
+
+ private int nextDocChunk(StringBuilder sb, int maxChunkLength) { // appends field text to sb until it holds maxChunkLength chars or fields run out
+ if (currentFieldIdx > fields.length-1) { // all listed fields have been visited
+ return returnEod();
+ }
+
+ int startFieldValueIdx = currentFieldValueIdx; // resume point left behind by the previous call
+ int startFieldValueOffset = currentFieldValueOffset;
+
+ do {
+ SolrInputField f = doc.getField(fields[currentFieldIdx]);
+ if (f == null) {
+ log.debug("Field with name {} did not exist on docuemnt.", fields[currentFieldIdx]); // NOTE(review): "docuemnt" is a typo in the log message
+ incField(sb);
+ continue;
+ }
+ Iterator<Object> fvIt = f.iterator();
+ currentFieldValueIdx = -1;
+ while (fvIt.hasNext() && sb.length() < maxChunkLength) {
+ currentFieldValueIdx++;
+ String fvStr = String.valueOf(fvIt.next()); // every value is stringified, whatever its type
+ if (currentFieldValueIdx < startFieldValueIdx) continue; // skip values before the resume point
+ startFieldValueIdx = 0;
+ if (sb.length() > 0) { // separator goes in only when sb already has content; truncated if the chunk is nearly full
+ if (maxChunkLength - sb.length() < fieldValueSep.length()) {
+ sb.append(fieldValueSep.substring(0,maxChunkLength - sb.length()));
+ } else {
+ sb.append(fieldValueSep);
+ }
+ }
+ currentFieldValueOffset = startFieldValueOffset;
+ startFieldValueOffset = 0; // the saved offset applies only to the first value handled in this call
+ int charsNeeded = maxChunkLength - sb.length();
+ int endOffset = fvStr.length();
+ if (fvStr.length() - currentFieldValueOffset > charsNeeded) {
+ endOffset = currentFieldValueOffset + charsNeeded;
+ }
+ if (endOffset - currentFieldValueOffset > maxCharsPerFieldValue) {
+ endOffset = maxCharsPerFieldValue - currentFieldValueOffset; // NOTE(review): with a non-zero offset this can fall below currentFieldValueOffset, and the cap is checked per chunk only; verify
+ }
+ sb.append(fvStr.substring(currentFieldValueOffset, endOffset));
+ currentFieldValueOffset = endOffset == fvStr.length() ? 0 : endOffset; // 0 when the value was read to its end
+ }
+ if (sb.length() >= maxChunkLength) {
+ return returnValue(sb);
+ } else {
+ incField(sb);
+ }
+ } while (currentFieldIdx <= fields.length-1 && sb.length() < maxChunkLength);
+ return sb.length() == 0 ? eodReturnValue : sb.length();
+ }
+
+ private int returnEod() { // marks end of document and yields the configured end-of-document value
+ eod = true;
+ return eodReturnValue;
+ }
+
+ private int returnValue(StringBuilder sb) { // length of sb, or end of document when sb is empty
+ if (sb.length() == 0) {
+ return returnEod();
+ } else {
+ return sb.length();
+ }
+ }
+
+ private void incField(StringBuilder sb) { // moves to the next field and resets the value cursor; sb is not used
+ currentFieldIdx++;
+ currentFieldValueIdx = 0;
+ currentFieldValueOffset = 0;
+ }
+
+ @Override
+ public void close() throws IOException { /* ignored */ }
+
+ @Override
+ public boolean ready() throws IOException {
+ return !eod;
+ }
+
+ /**
+ * Choose another return value than -1 for end of document reached.
+ * <b>Warning: Only to work around buggy consumers such as LangDetect 1.1</b>
+ * @param eodReturnValue integer which defaults to -1
+ */
+ public void setEodReturnValue(int eodReturnValue) {
+ this.eodReturnValue = eodReturnValue;
+ }
+
+ /**
+ * Gets the whole reader as a String; an IOException while reading is rethrown as SolrException (SERVER_ERROR)
+ * @return the full content read from the given reader
+ */
+ public static String asString(Reader reader) {
+ try {
+ return IOUtils.toString(reader);
+ } catch (IOException e) {
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Failed reading doc content from reader", e);
+ }
+ }
+
+ protected static String[] getStringFields(SolrInputDocument doc) { // names of the fields whose first value is a String
+ Iterable<SolrInputField> iterable = () -> doc.iterator();
+ List<String> strFields = StreamSupport.stream(iterable.spliterator(), false)
+ .filter(f -> f.getFirstValue() instanceof String) // only the first value of each field is inspected
+ .map(SolrInputField::getName).collect(Collectors.toList());
+ return strFields.toArray(new String[0]);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/00f8f3a1/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
index 5c8146d..5a43bd5 100644
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
@@ -16,6 +16,7 @@
*/
package org.apache.solr.update.processor;
+import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
@@ -23,8 +24,6 @@ import java.util.List;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.tika.language.LanguageIdentifier;
-
-import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -44,11 +43,11 @@ public class TikaLanguageIdentifierUpdateProcessor extends LanguageIdentifierUpd
SolrQueryResponse rsp, UpdateRequestProcessor next) {
super(req, rsp, next);
}
-
+
@Override
- protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
+ protected List<DetectedLanguage> detectLanguage(Reader solrDocReader) {
+ String content = SolrInputDocumentReader.asString(solrDocReader);
List<DetectedLanguage> languages = new ArrayList<>();
- String content = concatFields(doc);
if (content.length() != 0) {
LanguageIdentifier identifier = new LanguageIdentifier(content);
// FIXME: Hack - we get the distance from toString and calculate our own certainty score
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/00f8f3a1/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
index 21ecd7d..9fc3eb1 100644
--- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
+++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
@@ -140,14 +140,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = englishDoc();
- assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
- assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
+ assertEquals("en", process(doc).getFieldValue("language"));
+ assertEquals("en", process(doc).getFieldValue("languages"));
doc = englishDoc();
doc.setField("language", "no");
- assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
- assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
- assertNotNull(liProcessor.process(doc).getFieldValue("text_no"));
+ assertEquals("no", process(doc).getFieldValue("language"));
+ assertEquals("no", process(doc).getFieldValue("languages"));
+ assertNotNull(process(doc).getFieldValue("text_no"));
}
/**
@@ -166,14 +166,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = englishDoc();
- assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
- assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
+ assertEquals("en", process(doc).getFieldValue("language"));
+ assertEquals("en", process(doc).getFieldValue("languages"));
doc = englishDoc();
doc.setField("language", "no");
- assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
- assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
- assertNotNull(liProcessor.process(doc).getFieldValue("text_multivalue_no"));
+ assertEquals("no", process(doc).getFieldValue("language"));
+ assertEquals("no", process(doc).getFieldValue("languages"));
+ assertNotNull(process(doc).getFieldValue("text_multivalue_no"));
}
/**
@@ -192,14 +192,14 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = mixedEnglishRussianDoc();
- assertEquals("en", liProcessor.process(doc).getFieldValue("language"));
- assertEquals("en", liProcessor.process(doc).getFieldValue("languages"));
+ assertEquals("en", process(doc).getFieldValue("language"));
+ assertEquals("en", process(doc).getFieldValue("languages"));
doc = mixedEnglishRussianDoc();
doc.setField("language", "no");
- assertEquals("no", liProcessor.process(doc).getFieldValue("language"));
- assertEquals("no", liProcessor.process(doc).getFieldValue("languages"));
- assertNotNull(liProcessor.process(doc).getFieldValue("text_multivalue_no"));
+ assertEquals("no", process(doc).getFieldValue("language"));
+ assertEquals("no", process(doc).getFieldValue("languages"));
+ assertNotNull(process(doc).getFieldValue("text_multivalue_no"));
}
@Test
@@ -212,7 +212,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = tooShortDoc();
- assertEquals("", liProcessor.process(doc).getFieldValue("language"));
+ assertEquals("", process(doc).getFieldValue("language"));
}
@Test
@@ -225,7 +225,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
liProcessor = createLangIdProcessor(parameters);
doc = new SolrInputDocument();
- assertEquals("", liProcessor.process(doc).getFieldValue("language"));
+ assertEquals("", process(doc).getFieldValue("language"));
}
@Test
@@ -242,11 +242,11 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
// Verify fallback to field fb (noop field does not exist and is skipped)
doc = tooShortDoc();
doc.addField("fb", "fbField");
- assertEquals("fbField", liProcessor.process(doc).getFieldValue("language"));
+ assertEquals("fbField", process(doc).getFieldValue("language"));
// Verify fallback to fallback value since no fallback fields exist
doc = tooShortDoc();
- assertEquals("fbVal", liProcessor.process(doc).getFieldValue("language"));
+ assertEquals("fbVal", process(doc).getFieldValue("language"));
}
@Test
@@ -272,6 +272,60 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback"));
}
+ @Test
+ public void testKeepOrig() throws Exception {
+ ModifiableSolrParams parameters = new ModifiableSolrParams();
+ parameters.set("langid.enforceSchema", "false");
+ parameters.set("langid.langField", "language");
+ parameters.set("langid.langsField", "languages");
+ parameters.set("langid.fl", "text");
+ parameters.set("langid.map", "true");
+ parameters.set("langid.map.keepOrig", "false"); // scenario 1: keepOrig off, the source field must be gone after mapping
+ liProcessor = createLangIdProcessor(parameters);
+
+ SolrInputDocument mappedNoOrig = process(englishDoc());
+ assertEquals("text_en", liProcessor.getMappedField("text", "en"));
+ assertEquals("en", mappedNoOrig.getFieldValue("language"));
+ assertTrue(mappedNoOrig.containsKey("text_en"));
+ assertFalse(mappedNoOrig.containsKey("text"));
+
+ // scenario 2: keepOrig true, both the mapped field and the original field must be present
+ parameters.set("langid.map.keepOrig", "true");
+ liProcessor = createLangIdProcessor(parameters);
+
+ SolrInputDocument mappedKeepOrig = process(englishDoc());
+ assertTrue(mappedKeepOrig.containsKey("text_en"));
+ assertTrue(mappedKeepOrig.containsKey("text"));
+ assertEquals(englishDoc().getFieldValue("text"), mappedKeepOrig.getFieldValue("text_en"));
+
+ // scenario 3: keepOrig and map individual together, each of the two fields keeps its original next to its mapped copy
+ parameters.set("langid.map.individual", "true");
+ parameters.set("langid.fl", "text,text2");
+ liProcessor = createLangIdProcessor(parameters);
+
+ SolrInputDocument mappedIndividual = process(languagePerFieldDoc());
+ assertTrue(mappedIndividual.containsKey("text_en"));
+ assertTrue(mappedIndividual.containsKey("text"));
+ assertTrue(mappedIndividual.containsKey("text2_ru"));
+ assertTrue(mappedIndividual.containsKey("text2"));
+ assertEquals(languagePerFieldDoc().getFieldValue("text"), mappedIndividual.getFieldValue("text_en"));
+ }
+
+ @Test
+ public void testMapIndividual() throws Exception {
+ ModifiableSolrParams parameters = new ModifiableSolrParams();
+ parameters.set("langid.enforceSchema", "false");
+ parameters.set("langid.langField", "language");
+ parameters.set("langid.langsField", "languages");
+ parameters.set("langid.fl", "text,text2");
+ parameters.set("langid.map", "true");
+ parameters.set("langid.map.individual", "true"); // per-field mapping, here without langid.map.keepOrig
+ liProcessor = createLangIdProcessor(parameters);
+
+ SolrInputDocument mappedIndividual = process(languagePerFieldDoc()); // "text" is expected to map to text_en and "text2" to text2_ru
+ assertTrue(mappedIndividual.containsKey("text_en"));
+ assertTrue(mappedIndividual.containsKey("text2_ru"));
+ }
// Various utility methods
@@ -282,6 +336,12 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
return doc;
}
+ /**
+ * Construct a document whose fields are in different languages: starts from
+ * {@code englishDoc()} and adds a second field, "text2", holding predominantly
+ * Russian text.
+ * @return solr input document
+ */
+ private SolrInputDocument languagePerFieldDoc() {
+ SolrInputDocument doc = englishDoc();
+ doc.addField("text2", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия).");
+ return doc;
+ }
+
/**
* Construct document containing multi-value fields in different languages.
* @return solr input document
@@ -307,7 +367,7 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
if(liProcessor == null)
throw new Exception("Processor must be initialized before calling assertLang()");
SolrInputDocument doc = sid(fieldsAndValues);
- assertEquals(langCode, liProcessor.process(doc).getFieldValue(liProcessor.langField));
+ assertEquals(langCode, process(doc).getFieldValue(liProcessor.langField));
}
private SolrInputDocument sid(String... fieldsAndValues) {
@@ -317,4 +377,13 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
}
return doc;
}
+
+ /**
+ * Utility test method to process a clone of a document.
+ * The input is deep-copied and the copy is handed to {@code liProcessor}, so the
+ * instance passed in by the caller is not the one given to the processor.
+ * @param origDoc the document to copy and process
+ * @return the processed copy
+ */
+ private SolrInputDocument process(SolrInputDocument origDoc) {
+ SolrInputDocument modifiedDoc = origDoc.deepCopy();
+ // The processor's own return value is ignored; the copy it was given is returned
+ liProcessor.process(modifiedDoc);
+ return modifiedDoc;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/00f8f3a1/solr/contrib/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
----------------------------------------------------------------------
diff --git a/solr/contrib/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java b/solr/contrib/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
new file mode 100644
index 0000000..5e28a52
--- /dev/null
+++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.util.Arrays;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for {@link SolrInputDocumentReader}: reading the values of a
+ * {@link SolrInputDocument} as one character stream, in chunks, with length
+ * limits, and with or without an explicit field list and separator.
+ */
+public class SolrInputDocumentReaderTest {
+ private SolrInputDocument doc;
+ private String[] allFields;
+
+ /**
+ * Builds the shared fixture: a single-valued string field (f1), a three-valued
+ * string field (f2), an integer field (f3) and a 20-digit string field (f4).
+ */
+ @Before
+ public void setUp() throws Exception {
+ doc = new SolrInputDocument();
+ doc.addField("f1", "a b c");
+ doc.addField("f2", "multi");
+ doc.addField("f2", "valued");
+ doc.addField("f2", "field");
+ doc.addField("f3", 123);
+ doc.addField("f4", "12345678901234567890");
+ allFields = new String[] {"f1", "f2", "f3", "f4"};
+ }
+
+ /**
+ * Reads the stream in several small chunks into increasing offsets of the same
+ * buffer and checks the accumulated content after each read. The final read asks
+ * for more characters than remain under the limit of 20 and must stop there.
+ */
+ @Test
+ public void readChunked() throws Exception {
+ SolrInputDocumentReader reader = new SolrInputDocumentReader(
+ doc,
+ allFields,
+ 20,
+ 18,
+ " - ");
+ assertTrue(reader.ready());
+ char[] chars = new char[1000];
+ int len;
+ assertEquals(9, len=reader.read(chars, 0, 9));
+ assertArrEqu("a b c - m", chars, len);
+ len += reader.read(chars, 9, 2);
+ assertArrEqu("a b c - mul", chars, len);
+ len += reader.read(chars, 11, 1);
+ assertArrEqu("a b c - mult", chars, len);
+ len += reader.read(chars, 12, 10);
+ // We now hit totalMaxChars
+ assertArrEqu("a b c - multi - valu", chars, len);
+ }
+
+ /**
+ * With a limit of 2 characters per field value, every value contributes only its
+ * first two characters, and the overall output is capped at 21 characters even
+ * though 22 are requested.
+ */
+ @Test
+ public void maxFieldValueLength() throws Exception {
+ SolrInputDocumentReader reader = new SolrInputDocumentReader(
+ doc,
+ allFields,
+ 21,
+ 2,
+ " - "
+ );
+ assertTrue(reader.ready());
+ char[] chars = new char[1000];
+ int len = reader.read(chars, 0, 22);
+ assertEquals(21, len);
+ // "a b c" cut to two characters is "a " (trailing space kept), so it is followed
+ // by the " - " separator as "a  - " with TWO spaces; the literal is 21 chars long.
+ assertArrEqu("a  - mu - va - fi - 1", chars, len);
+ }
+
+ /**
+ * Uses the constructor without a field list or separator, with limits far above
+ * the document size. The expected text contains the values of f1, f2 and f4
+ * joined by single spaces; the integer field f3 does not appear.
+ */
+ @Test
+ public void allStrFields() throws Exception {
+ SolrInputDocumentReader reader = new SolrInputDocumentReader(
+ doc,
+ 20000,
+ 10000
+ );
+ assertTrue(reader.ready());
+ char[] chars = new char[1000];
+ int len = reader.read(chars, 0, 1000);
+ assertEquals(45, len);
+ assertArrEqu("a b c multi valued field 12345678901234567890", chars, len);
+ }
+
+ /**
+ * {@code getStringFields} is expected to return only the names of the fields
+ * holding string values, i.e. everything in the fixture except the integer f3.
+ */
+ @Test
+ public void testGetStringFields() throws Exception {
+ String[] expected = new String[] {"f1", "f2", "f4"};
+ assertArrayEquals(expected, SolrInputDocumentReader.getStringFields(doc));
+ }
+
+ /**
+ * Asserts that the first {@code len} characters of {@code chars} spell
+ * {@code expected}.
+ */
+ private void assertArrEqu(String expected, char[] chars, int len) {
+ String str = new String(Arrays.copyOf(chars, len));
+ assertEquals(expected, str);
+ }
+
+}
\ No newline at end of file