You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/11/02 10:30:04 UTC
[6/7] lucene-solr:jira/gradle: Adding solr:analysis-extras module
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
new file mode 100644
index 0000000..d69c367
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
@@ -0,0 +1,577 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.update.processor;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import opennlp.tools.util.Span;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.OpenNLPTokenizer;
+import org.apache.lucene.analysis.opennlp.tools.NLPNERTaggerOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.Pair;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.update.AddUpdateCommand;
+import org.apache.solr.update.processor.FieldMutatingUpdateProcessor.FieldNameSelector;
+import org.apache.solr.update.processor.FieldMutatingUpdateProcessorFactory.SelectorParams;
+import org.apache.solr.util.plugin.SolrCoreAware;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
+
+/**
+ * Extracts named entities using an OpenNLP NER <code>modelFile</code> from the values found in
+ * any matching <code>source</code> field into a configured <code>dest</code> field, after
+ * first tokenizing the source text using the index analyzer on the configured
+ * <code>analyzerFieldType</code>, which must include <code>solr.OpenNLPTokenizerFactory</code>
+ * as the tokenizer. E.g.:
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="opennlp-en-tokenization" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.OpenNLPTokenizerFactory"
+ *                sentenceModel="en-sent.bin"
+ *                tokenizerModel="en-tokenizer.bin"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ *
+ * <p>See the <a href="http://opennlp.apache.org/models.html">OpenNLP website</a>
+ * for information on downloading pre-trained models.</p>
+ *
+ * Note that in order to use model files larger than 1MB on SolrCloud,
+ * <a href="https://lucene.apache.org/solr/guide/setting-up-an-external-zookeeper-ensemble#increasing-zookeeper-s-1mb-file-size-limit"
+ * >ZooKeeper server and client configuration is required</a>.
+ *
+ * <p>
+ * The <code>source</code> field(s) can be configured as either:
+ * </p>
+ * <ul>
+ * <li>One or more <code>&lt;str&gt;</code></li>
+ * <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code></li>
+ * <li>A <code>&lt;lst&gt;</code> containing
+ * {@link FieldMutatingUpdateProcessor FieldMutatingUpdateProcessorFactory style selector arguments}</li>
+ * </ul>
+ *
+ * <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code>
+ * containing the literal name of a destination field, or it may be a <code>&lt;lst&gt;</code> specifying a
+ * regex <code>pattern</code> and a <code>replacement</code> string. If the pattern + replacement option
+ * is used the pattern will be matched against all fields matched by the source selector, and the replacement
+ * string (including any capture groups specified from the pattern) will be evaluated using
+ * {@link Matcher#replaceAll(String)} to generate the literal name of the destination field. Additionally,
+ * an occurrence of the string "{EntityType}" in the <code>dest</code> field specification, or in the
+ * <code>replacement</code> string, will be replaced with the entity type(s) returned for each entity by
+ * the OpenNLP NER model; as a result, if the model extracts more than one entity type, then more than one
+ * <code>dest</code> field will be populated.
+ * </p>
+ *
+ * <p>If the resolved <code>dest</code> field already exists in the document, then the
+ * named entities extracted from the <code>source</code> fields will be added to it.
+ * </p>
+ * <p>
+ * In the example below:
+ * </p>
+ * <ul>
+ * <li>Named entities will be extracted from the <code>text</code> field and added
+ * to the <code>people_s</code> field</li>
+ * <li>Named entities will be extracted from both the <code>title</code> and
+ * <code>subtitle</code> fields and added into the <code>titular_people</code> field</li>
+ * <li>Named entities will be extracted from any field with a name ending in <code>_txt</code>
+ * -- except for <code>notes_txt</code> -- and added into the <code>people_s</code> field</li>
+ * <li>Named entities will be extracted from any field with a name beginning with "desc" and
+ * ending in "s" (e.g. "descs" and "descriptions") and added to a field prefixed with "key_",
+ * not ending in "s", and suffixed with "_people". (e.g. "key_desc_people" or
+ * "key_description_people")</li>
+ * <li>Named entities will be extracted from the <code>summary</code> field and added
+ * to the <code>summary_person_s</code> field, assuming that the modelFile only extracts
+ * entities of type "person".</li>
+ * </ul>
+ *
+ * <pre class="prettyprint">
+ * &lt;updateRequestProcessorChain name="multiple-extract"&gt;
+ *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;str name="source"&gt;text&lt;/str&gt;
+ *     &lt;str name="dest"&gt;people_s&lt;/str&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;arr name="source"&gt;
+ *       &lt;str&gt;title&lt;/str&gt;
+ *       &lt;str&gt;subtitle&lt;/str&gt;
+ *     &lt;/arr&gt;
+ *     &lt;str name="dest"&gt;titular_people&lt;/str&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;lst name="source"&gt;
+ *       &lt;str name="fieldRegex"&gt;.*_txt$&lt;/str&gt;
+ *       &lt;lst name="exclude"&gt;
+ *         &lt;str name="fieldName"&gt;notes_txt&lt;/str&gt;
+ *       &lt;/lst&gt;
+ *     &lt;/lst&gt;
+ *     &lt;str name="dest"&gt;people_s&lt;/str&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;lst name="source"&gt;
+ *       &lt;str name="fieldRegex"&gt;^desc(.*)s$&lt;/str&gt;
+ *     &lt;/lst&gt;
+ *     &lt;lst name="dest"&gt;
+ *       &lt;str name="pattern"&gt;^desc(.*)s$&lt;/str&gt;
+ *       &lt;str name="replacement"&gt;key_desc$1_people&lt;/str&gt;
+ *     &lt;/lst&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
+ *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
+ *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
+ *     &lt;str name="source"&gt;summary&lt;/str&gt;
+ *     &lt;str name="dest"&gt;summary_{EntityType}_s&lt;/str&gt;
+ *   &lt;/processor&gt;
+ *   &lt;processor class="solr.LogUpdateProcessorFactory" /&gt;
+ *   &lt;processor class="solr.RunUpdateProcessorFactory" /&gt;
+ * &lt;/updateRequestProcessorChain&gt;
+ * </pre>
+ *
+ * @since 7.3.0
+ */
+public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory
+ extends UpdateRequestProcessorFactory implements SolrCoreAware {
+
+ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+ public static final String SOURCE_PARAM = "source";
+ public static final String DEST_PARAM = "dest";
+ public static final String PATTERN_PARAM = "pattern";
+ public static final String REPLACEMENT_PARAM = "replacement";
+ public static final String MODEL_PARAM = "modelFile";
+ public static final String ANALYZER_FIELD_TYPE_PARAM = "analyzerFieldType";
+ public static final String ENTITY_TYPE = "{EntityType}";
+
+ private SelectorParams srcInclusions = new SelectorParams();
+ private Collection<SelectorParams> srcExclusions = new ArrayList<>();
+
+ private FieldNameSelector srcSelector = null;
+
+ private String modelFile = null;
+ private String analyzerFieldType = null;
+
+ /**
+ * If pattern is null, then this is a literal field name. If pattern is non-null then this
+ * is a replacement string that may contain meta-characters (ie: capture group identifiers)
+ * @see #pattern
+ */
+ private String dest = null;
+ /** @see #dest */
+ private Pattern pattern = null;
+
+ /**
+  * Returns the selector used to pick the source fields.
+  *
+  * @return the selector assigned by {@link #inform(SolrCore)}
+  * @throws SolrException if no selector has been assigned yet
+  */
+ protected final FieldNameSelector getSourceSelector() {
+   if (srcSelector == null) {
+     throw new SolrException(SERVER_ERROR, "selector was never initialized, inform(SolrCore) never called???");
+   }
+   return srcSelector;
+ }
+
+ /**
+  * Reads the factory configuration from <code>args</code>.
+  *
+  * <p>The mandatory <code>modelFile</code> and <code>analyzerFieldType</code> params are consumed
+  * first, then the remaining params are handed to either the full ("source" + "dest") or the
+  * short hand ("pattern" + "replacement") init helper. Every recognized param is removed from
+  * <code>args</code>; anything left over is reported as an error.</p>
+  *
+  * @param args the init params of this factory; consumed params are removed from it
+  * @throws SolrException if a mandatory param is missing or malformed, if neither syntax is
+  *         complete, or if unexpected params remain
+  */
+ @SuppressWarnings("unchecked")
+ @Override
+ public void init(NamedList args) {
+
+   // Consume the mandatory top level params before dispatching: the short hand helper rejects
+   // every top level param other than pattern and replacement, so leaving these two in args
+   // would make the short hand syntax impossible to use.
+   modelFile = removeRequiredStringParam(args, MODEL_PARAM);
+   analyzerFieldType = removeRequiredStringParam(args, ANALYZER_FIELD_TYPE_PARAM);
+
+   // high level (loose) check for which type of config we have.
+   //
+   // individual init methods do more strict syntax checking
+   if (0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM, 0) ) {
+     initSourceSelectorSyntax(args);
+   } else if (0 <= args.indexOf(PATTERN_PARAM, 0) && 0 <= args.indexOf(REPLACEMENT_PARAM, 0)) {
+     initSimpleRegexReplacement(args);
+   } else {
+     throw new SolrException(SERVER_ERROR, "A combination of either '" + SOURCE_PARAM + "' + '"+
+         DEST_PARAM + "', or '" + REPLACEMENT_PARAM + "' + '" +
+         PATTERN_PARAM + "' init params are mandatory");
+   }
+
+   // everything we understand has been removed by now
+   if (0 < args.size()) {
+     throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'");
+   }
+
+   super.init(args);
+ }
+
+ /**
+  * Removes the named param from <code>args</code> and returns it as a string.
+  *
+  * @throws SolrException if the param is absent or is not a <code>&lt;str&gt;</code>
+  */
+ private static String removeRequiredStringParam(NamedList args, String paramName) {
+   Object value = args.remove(paramName);
+   if (null == value) {
+     throw new SolrException(SERVER_ERROR, "Missing required init param '" + paramName + "'");
+   }
+   if ( ! (value instanceof CharSequence)) {
+     throw new SolrException(SERVER_ERROR, "Init param '" + paramName + "' must be a <str>");
+   }
+   return value.toString();
+ }
+
+ /**
+ * init helper method that should only be called when we know for certain that both the
+ * "source" and "dest" init params do <em>not</em> exist.
+ */
+ @SuppressWarnings("unchecked")
+ private void initSimpleRegexReplacement(NamedList args) {
+ // The syntactic sugar for the case where there is only one regex pattern for source and the same pattern
+ // is used for the destination pattern...
+ //
+ // pattern != null && replacement != null
+ //
+ // ...as top level elements, with no other config options specified
+
+ // if we got here we know we had pattern and replacement, now check for the other two so that we can give a better
+ // message than "unexpected"
+ if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0) ) {
+ throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
+ PATTERN_PARAM + " and " + REPLACEMENT_PARAM + " but also found " + SOURCE_PARAM + " or " + DEST_PARAM);
+ }
+
+ assert args.indexOf(SOURCE_PARAM, 0) < 0;
+
+ Object patt = args.remove(PATTERN_PARAM);
+ Object replacement = args.remove(REPLACEMENT_PARAM);
+
+ if (null == patt || null == replacement) {
+ throw new SolrException(SERVER_ERROR, "Init params '" + PATTERN_PARAM + "' and '" +
+ REPLACEMENT_PARAM + "' are both mandatory if '" + SOURCE_PARAM + "' and '"+
+ DEST_PARAM + "' are not both specified");
+ }
+
+ if (0 != args.size()) {
+ throw new SolrException(SERVER_ERROR, "Init params '" + REPLACEMENT_PARAM + "' and '" +
+ PATTERN_PARAM + "' must be children of '" + DEST_PARAM +
+ "' to be combined with other options.");
+ }
+
+ if (!(replacement instanceof String)) {
+ throw new SolrException(SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a string (i.e. <str>)");
+ }
+ if (!(patt instanceof String)) {
+ throw new SolrException(SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string (i.e. <str>)");
+ }
+
+ dest = replacement.toString();
+ try {
+ this.pattern = Pattern.compile(patt.toString());
+ } catch (PatternSyntaxException pe) {
+ throw new SolrException(SERVER_ERROR, "Init param " + PATTERN_PARAM +
+ " is not a valid regex pattern: " + patt, pe);
+
+ }
+ srcInclusions = new SelectorParams();
+ srcInclusions.fieldRegex = Collections.singletonList(this.pattern);
+ }
+
+ /**
+  * init helper method for the full syntax, that should only be called when we know for certain
+  * that both the "source" and "dest" init params <em>do</em> exist.
+  *
+  * <p>"source" may be one or more field names, or a single <code>&lt;lst&gt;</code> of selector
+  * options that may contain nested "exclude" lists. "dest" may be a literal field name or a
+  * <code>&lt;lst&gt;</code> holding "pattern" and "replacement". Params that are consumed are
+  * removed from <code>args</code>.</p>
+  *
+  * @param args the factory init params
+  * @throws SolrException if the configuration is malformed
+  */
+ @SuppressWarnings("unchecked")
+ private void initSourceSelectorSyntax(NamedList args) {
+   // if we got here we know we had source and dest, now check for the other two so that we can
+   // give a better message than "unexpected"
+   if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0) ) {
+     throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
+         SOURCE_PARAM + " and " + DEST_PARAM + " but also found " + PATTERN_PARAM + " or " + REPLACEMENT_PARAM);
+   }
+
+   Object d = args.remove(DEST_PARAM);
+   assert null != d;
+
+   List<Object> sources = args.getAll(SOURCE_PARAM);
+   assert null != sources;
+
+   if (1 == sources.size() && sources.get(0) instanceof NamedList) {
+     // nested set of selector options
+     NamedList selectorConfig = (NamedList) args.remove(SOURCE_PARAM);
+
+     srcInclusions = parseSelectorParams(selectorConfig);
+
+     List<Object> excList = selectorConfig.getAll("exclude");
+
+     for (Object excObj : excList) {
+       if (null == excObj) {
+         throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
+             "' child 'exclude' can not be null");
+       }
+       if (!(excObj instanceof NamedList)) {
+         throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
+             "' child 'exclude' must be <lst/>");
+       }
+       NamedList exc = (NamedList) excObj;
+       srcExclusions.add(parseSelectorParams(exc));
+       if (0 < exc.size()) {
+         // report the leftover entry of the exclude list itself, not of the enclosing selector
+         throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
+             "' has unexpected 'exclude' sub-param(s): '"
+             + exc.getName(0) + "'");
+       }
+       // call once per instance
+       selectorConfig.remove("exclude");
+     }
+
+     if (0 < selectorConfig.size()) {
+       throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
+           "' contains unexpected child param(s): '" +
+           selectorConfig.getName(0) + "'");
+     }
+     // drop it from the local list so it doesn't interfere with the string handling below
+     sources.remove(0);
+   }
+   if (!sources.isEmpty()) {
+     // source better be one or more strings
+     srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs(SOURCE_PARAM));
+   }
+   if (srcInclusions == null) {
+     throw new SolrException(SERVER_ERROR,
+         "Init params do not specify any field from which to extract entities, please supply either "
+         + SOURCE_PARAM + " and " + DEST_PARAM + " or " + PATTERN_PARAM + " and " + REPLACEMENT_PARAM + ". See javadocs " +
+         "for OpenNLPExtractNamedEntitiesUpdateProcessor for further details.");
+   }
+
+   if (d instanceof NamedList) {
+     // dest is a pattern + replacement pair
+     NamedList destList = (NamedList) d;
+
+     Object patt = destList.remove(PATTERN_PARAM);
+     Object replacement = destList.remove(REPLACEMENT_PARAM);
+
+     if (null == patt || null == replacement) {
+       throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
+           PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
+           "' are both mandatory and can not be null");
+     }
+     if (! (patt instanceof String && replacement instanceof String)) {
+       throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
+           PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
+           "' must both be strings (i.e. <str>)");
+     }
+     if (0 != destList.size()) {
+       throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' has unexpected children: '"
+           + destList.getName(0) + "'");
+     }
+
+     try {
+       this.pattern = Pattern.compile(patt.toString());
+     } catch (PatternSyntaxException pe) {
+       throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' child '" + PATTERN_PARAM +
+           "' is not a valid regex pattern: " + patt, pe);
+     }
+     dest = replacement.toString();
+
+   } else if (d instanceof String) {
+     // dest is a literal field name
+     dest = d.toString();
+   } else {
+     throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' must either be a string " +
+         "(i.e. <str>) or a list (i.e. <lst>) containing '" +
+         PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM + "'");
+   }
+
+ }
+
+ /**
+  * Builds the source field selector from the configured inclusions, wraps it once per configured
+  * exclusion, and then asks {@link OpenNLPOpsFactory} for the NER model named by
+  * <code>modelFile</code> (presumably so that it is loaded before the first processor is created).
+  *
+  * @throws IllegalArgumentException if requesting the NER model fails with an IOException
+  */
+ @Override
+ public void inform(final SolrCore core) {
+
+   srcSelector = FieldMutatingUpdateProcessor.createFieldNameSelector(
+       core.getResourceLoader(), core, srcInclusions, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
+
+   for (SelectorParams exclusion : srcExclusions) {
+     final FieldNameSelector excluded = FieldMutatingUpdateProcessor.createFieldNameSelector(
+         core.getResourceLoader(), core, exclusion, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
+     srcSelector = FieldMutatingUpdateProcessor.wrap(srcSelector, excluded);
+   }
+
+   try {
+     OpenNLPOpsFactory.getNERTaggerModel(modelFile, core.getResourceLoader());
+   } catch (IOException ioe) {
+     throw new IllegalArgumentException(ioe);
+   }
+ }
+
+ @Override
+ public final UpdateRequestProcessor getInstance
+ (SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
+ final FieldNameSelector srcSelector = getSourceSelector();
+ return new UpdateRequestProcessor(next) {
+ private final NLPNERTaggerOp nerTaggerOp;
+ private Analyzer analyzer = null;
+ {
+ try {
+ nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
+ FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
+ if (fieldType == null) {
+ throw new SolrException
+ (SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
+ }
+ analyzer = fieldType.getIndexAnalyzer();
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+ @Override
+ public void processAdd(AddUpdateCommand cmd) throws IOException {
+
+ final SolrInputDocument doc = cmd.getSolrInputDocument();
+
+ // Destination may be regex replace string, or "{EntityType}" replaced by
+ // each entity's type, both of which can cause multiple output fields.
+ Map<String,SolrInputField> destMap = new HashMap<>();
+
+ // preserve initial values
+ for (final String fname : doc.getFieldNames()) {
+ if ( ! srcSelector.shouldMutate(fname)) continue;
+
+ Collection<Object> srcFieldValues = doc.getFieldValues(fname);
+ if (srcFieldValues == null || srcFieldValues.isEmpty()) continue;
+
+ String resolvedDest = dest;
+
+ if (pattern != null) {
+ Matcher matcher = pattern.matcher(fname);
+ if (matcher.find()) {
+ resolvedDest = matcher.replaceAll(dest);
+ } else {
+ log.debug("srcSelector.shouldMutate(\"{}\") returned true, " +
+ "but replacement pattern did not match, field skipped.", fname);
+ continue;
+ }
+ }
+
+ for (Object val : srcFieldValues) {
+ for (Pair<String,String> entity : extractTypedNamedEntities(val)) {
+ SolrInputField destField = null;
+ String entityName = entity.first();
+ String entityType = entity.second();
+ final String resolved = resolvedDest.replace(ENTITY_TYPE, entityType);
+ if (doc.containsKey(resolved)) {
+ destField = doc.getField(resolved);
+ } else {
+ SolrInputField targetField = destMap.get(resolved);
+ if (targetField == null) {
+ destField = new SolrInputField(resolved);
+ } else {
+ destField = targetField;
+ }
+ }
+ destField.addValue(entityName);
+
+ // put it in map to avoid concurrent modification...
+ destMap.put(resolved, destField);
+ }
+ }
+ }
+
+ for (Map.Entry<String,SolrInputField> entry : destMap.entrySet()) {
+ doc.put(entry.getKey(), entry.getValue());
+ }
+ super.processAdd(cmd);
+ }
+
+ /** Using configured NER model, extracts (name, type) pairs from the given source field value */
+ private List<Pair<String,String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
+ List<Pair<String,String>> entitiesWithType = new ArrayList<>();
+ List<String> terms = new ArrayList<>();
+ List<Integer> startOffsets = new ArrayList<>();
+ List<Integer> endOffsets = new ArrayList<>();
+ String fullText = srcFieldValue.toString();
+ TokenStream tokenStream = analyzer.tokenStream("", fullText);
+ CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
+ OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
+ FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
+ tokenStream.reset();
+ synchronized (nerTaggerOp) {
+ while (tokenStream.incrementToken()) {
+ terms.add(termAtt.toString());
+ startOffsets.add(offsetAtt.startOffset());
+ endOffsets.add(offsetAtt.endOffset());
+ boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+ if (endOfSentence) { // extract named entities one sentence at a time
+ extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
+ }
+ }
+ tokenStream.end();
+ tokenStream.close();
+ if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
+ extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
+ }
+ nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
+ }
+ return entitiesWithType;
+ }
+
+ private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
+ List<Integer> endOffsets, List<Pair<String,String>> entitiesWithType) {
+ for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
+ String text = fullText.substring(startOffsets.get(span.getStart()), endOffsets.get(span.getEnd() - 1));
+ entitiesWithType.add(new Pair<>(text, span.getType()));
+ }
+ terms.clear();
+ startOffsets.clear();
+ endOffsets.clear();
+ }
+ };
+ }
+
+ /** macro */
+ private static SelectorParams parseSelectorParams(NamedList args) {
+ return FieldMutatingUpdateProcessorFactory.parseSelectorParams(args);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/package.html b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/package.html
new file mode 100644
index 0000000..1388c29
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/update/processor/package.html
@@ -0,0 +1,24 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- not a package-info.java, because we already defined this package in core/ -->
+<html>
+ <body>
+ Update request processor invoking OpenNLP Named Entity Recognition over configured
+ source field(s), populating configured target field(s) with the results.
+ </body>
+</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/overview.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/overview.html b/solr/contrib/analysis-extras/src/main/java/overview.html
new file mode 100644
index 0000000..f3d70ca
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/overview.html
@@ -0,0 +1,21 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+Apache Solr Search Server: Analysis Extras contrib
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-ner.bin
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-ner.bin b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-ner.bin
deleted file mode 100644
index b4d8cdc..0000000
Binary files a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-ner.bin and /dev/null differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-sent.bin
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-sent.bin b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-sent.bin
deleted file mode 100644
index 6e19e6b..0000000
Binary files a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-sent.bin and /dev/null differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-tokenizer.bin
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-tokenizer.bin b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-tokenizer.bin
deleted file mode 100644
index 796a744..0000000
Binary files a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/en-test-tokenizer.bin and /dev/null differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-folding-extra.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-folding-extra.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-folding-extra.xml
deleted file mode 100644
index 573ca53..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-folding-extra.xml
+++ /dev/null
@@ -1,52 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- -->
-
-<schema name="test" version="1.0">
- <fieldType name="string" class="solr.StrField" sortMissingLast="true" multiValued="false"/>
-
-
- <fieldType name="text_icufolding" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.ICUFoldingFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <fieldType name="text_icunormalizer2" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.ICUNormalizer2FilterFactory" name="nfkc_cf" mode="compose"/>
- </analyzer>
- </fieldType>
-
- <fieldType name="text_icutransform" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.ICUTransformFilterFactory" id="Cyrillic-Latin"/>
- </analyzer>
- </fieldType>
-
-
- <field name="id" type="string" indexed="true" stored="true" required="true"/>
- <field name="content_icufolding" type="text_icufolding" indexed="true" stored="true"/>
- <field name="content_icunormalizer2" type="text_icunormalizer2" indexed="true" stored="true"/>
- <field name="content_icutransform" type="text_icutransform" indexed="true" stored="true"/>
-
-
- <uniqueKey>id</uniqueKey>
-
-</schema>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml
deleted file mode 100644
index 63f7330..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml
+++ /dev/null
@@ -1,57 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!-- Test schema file for CollationField (docvalues) -->
-
-<schema name="test" version="1.0">
-
- <fieldType name="string" class="solr.StrField" omitNorms="true" positionIncrementGap="0"/>
-
- <!-- basic text field -->
- <fieldType name="text" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <fieldType name="sort_ar_t" class="solr.ICUCollationField" locale="ar"/>
- <fieldType name="sort_de_t" class="solr.ICUCollationField" locale="de" strength="primary"/>
- <fieldType name="sort_tr_canon_t" class="solr.ICUCollationField" locale="tr" strength="primary"
- decomposition="canonical"/>
- <fieldType name="sort_da_t" class="solr.ICUCollationField" locale="da" strength="primary"/>
- <fieldType name="sort_custom_t" class="solr.ICUCollationField" custom="customrules.dat" strength="primary"/>
-
- <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
- <field name="text" type="text" indexed="true" stored="false"/>
- <field name="sort_ar" type="sort_ar_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
- <field name="sort_de" type="sort_de_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
- <field name="sort_tr_canon" type="sort_tr_canon_t" indexed="false" stored="false" multiValued="true"
- docValues="true"/>
- <field name="sort_da" type="sort_da_t" indexed="false" stored="false" multiValued="false" docValues="true"/>
- <field name="sort_custom" type="sort_custom_t" indexed="false" stored="false" multiValued="true" docValues="true"/>
-
- <uniqueKey>id</uniqueKey>
-
- <!-- copy our text to some sort fields with different orders -->
- <copyField source="text" dest="sort_ar"/>
- <copyField source="text" dest="sort_de"/>
- <copyField source="text" dest="sort_tr_canon"/>
- <copyField source="text" dest="sort_da"/>
- <copyField source="text" dest="sort_custom"/>
-</schema>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate.xml
deleted file mode 100644
index 9698013..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollate.xml
+++ /dev/null
@@ -1,57 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!-- Test schema file for CollationField -->
-
-<schema name="test" version="1.0">
-
- <fieldType name="string" class="solr.StrField" omitNorms="true" positionIncrementGap="0"/>
-
- <!-- basic text field -->
- <fieldType name="text" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <fieldType name="sort_ar_t" class="solr.ICUCollationField" locale="ar"/>
- <fieldType name="sort_de_t" class="solr.ICUCollationField" locale="de" strength="primary"/>
- <fieldType name="sort_tr_canon_t" class="solr.ICUCollationField" locale="tr" strength="primary"
- decomposition="canonical"/>
- <fieldType name="sort_da_t" class="solr.ICUCollationField" locale="da" strength="primary"/>
- <fieldType name="sort_custom_t" class="solr.ICUCollationField" custom="customrules.dat" strength="primary"/>
-
- <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
- <field name="text" type="text" indexed="true" stored="false"/>
- <field name="sort_ar" type="sort_ar_t" indexed="true" stored="false" multiValued="false"/>
- <field name="sort_de" type="sort_de_t" indexed="true" stored="false" multiValued="false"/>
- <field name="sort_tr_canon" type="sort_tr_canon_t" indexed="true" stored="false" multiValued="false"/>
- <field name="sort_da" type="sort_da_t" indexed="true" stored="false" multiValued="false"/>
- <field name="sort_custom" type="sort_custom_t" indexed="true" stored="false" multiValued="false"/>
-
-
- <uniqueKey>id</uniqueKey>
-
- <!-- copy our text to some sort fields with different orders -->
- <copyField source="text" dest="sort_ar"/>
- <copyField source="text" dest="sort_de"/>
- <copyField source="text" dest="sort_tr_canon"/>
- <copyField source="text" dest="sort_da"/>
- <copyField source="text" dest="sort_custom"/>
-</schema>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollateoptions.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollateoptions.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollateoptions.xml
deleted file mode 100644
index 59b8d25..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-icucollateoptions.xml
+++ /dev/null
@@ -1,68 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!-- Test schema file for CollationField options -->
-
-<schema name="test" version="1.0">
-
- <fieldType name="string" class="solr.StrField" omitNorms="true" positionIncrementGap="0"/>
-
- <!-- basic text field -->
- <fieldType name="text" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <!-- ignores punctuation and whitespace -->
- <fieldType name="sort_ignore_punctuation_t" class="solr.ICUCollationField"
- locale="en" strength="primary" alternate="shifted"/>
- <!-- ignores only whitespace -->
- <fieldType name="sort_ignore_space_t" class="solr.ICUCollationField"
- locale="en" strength="primary" alternate="shifted" variableTop=" "/>
- <!-- ignores only accents, but not case -->
- <fieldType name="sort_ignore_accents_t" class="solr.ICUCollationField"
- locale="en" strength="primary" caseLevel="true"/>
- <!-- sorts numerics in numeric order -->
- <fieldType name="sort_numerics_t" class="solr.ICUCollationField"
- locale="en" numeric="true"/>
- <!-- sorts uppercase before lowercase -->
- <fieldType name="sort_uppercase_first_t" class="solr.ICUCollationField"
- locale="en" strength="tertiary" caseFirst="upper"/>
-
-
- <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
- <field name="text" type="text" indexed="true" stored="false"/>
- <field name="sort_ignore_punctuation" type="sort_ignore_punctuation_t" indexed="true" stored="false"
- multiValued="false"/>
- <field name="sort_ignore_space" type="sort_ignore_space_t" indexed="true" stored="false" multiValued="false"/>
- <field name="sort_ignore_accents" type="sort_ignore_accents_t" indexed="true" stored="false" multiValued="false"/>
- <field name="sort_numerics" type="sort_numerics_t" indexed="true" stored="false" multiValued="false"/>
- <field name="sort_uppercase_first" type="sort_uppercase_first_t" indexed="true" stored="false" multiValued="false"/>
-
-
- <uniqueKey>id</uniqueKey>
-
- <!-- copy our text to some sort fields with different orders -->
- <copyField source="text" dest="sort_ignore_punctuation"/>
- <copyField source="text" dest="sort_ignore_space"/>
- <copyField source="text" dest="sort_ignore_accents"/>
- <copyField source="text" dest="sort_numerics"/>
- <copyField source="text" dest="sort_uppercase_first"/>
-</schema>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-opennlp-extract.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-opennlp-extract.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-opennlp-extract.xml
deleted file mode 100644
index fc13431..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/schema-opennlp-extract.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<schema name="test-opennlp-extract" version="1.6">
- <fieldType name="opennlp-en-tokenization" class="solr.TextField">
- <analyzer>
- <tokenizer class="solr.OpenNLPTokenizerFactory"
- sentenceModel="en-test-sent.bin"
- tokenizerModel="en-test-tokenizer.bin"/>
- </analyzer>
- </fieldType>
-
- <fieldType name="string" class="solr.StrField" sortMissingLast="true"/>
-
- <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
- <tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.PorterStemFilterFactory"/>
- </analyzer>
- </fieldType>
-
- <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
- <field name="text" type="text" indexed="true" stored="false"/>
- <field name="subject" type="text" indexed="true" stored="true"/>
- <field name="title" type="text" indexed="true" stored="true"/>
- <field name="subtitle" type="text" indexed="true" stored="true"/>
- <field name="descs" type="text" indexed="true" stored="true"/>
- <field name="descriptions" type="text" indexed="true" stored="true"/>
-
- <dynamicField name="*_txt" type="text" indexed="true" stored="true"/>
- <dynamicField name="*_s" type="string" indexed="true" stored="true" multiValued="true"/>
- <dynamicField name="*_people" type="string" indexed="true" stored="true" multiValued="true"/>
-</schema>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
deleted file mode 100644
index 90c52d7..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<config>
- <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
- <indexConfig>
- <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
- </indexConfig>
- <requestHandler name="/select" class="solr.SearchHandler"></requestHandler>
- <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
-</config>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-opennlp-extract.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-opennlp-extract.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-opennlp-extract.xml
deleted file mode 100644
index 7fd793e..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig-opennlp-extract.xml
+++ /dev/null
@@ -1,206 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<config>
- <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
- <xi:include href="solrconfig.snippet.randomindexconfig.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
- <requestHandler name="/select" class="solr.SearchHandler"></requestHandler>
- <requestHandler name="/update" class="solr.UpdateRequestHandler" />
- <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
- <schemaFactory class="ClassicIndexSchemaFactory"/>
-
- <updateRequestProcessorChain name="extract-single">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <str name="source">source1_s</str>
- <str name="dest">dest_s</str>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-single-regex">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <str name="source">source1_s</str>
- <lst name="dest">
- <str name="pattern">source\d(_s)</str>
- <str name="replacement">dest$1</str>
- </lst>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-multi">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <str name="source">source1_s</str>
- <str name="source">source2_s</str>
- <str name="dest">dest_s</str>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-multi-regex">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <str name="source">source1_s</str>
- <str name="source">source2_s</str>
- <lst name="dest">
- <str name="pattern">source\d(_s)</str>
- <str name="replacement">dest$1</str>
- </lst>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-array">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <arr name="source">
- <str>source1_s</str>
- <str>source2_s</str>
- </arr>
- <str name="dest">dest_s</str>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-array-regex">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <arr name="source">
- <str>source1_s</str>
- <str>source2_s</str>
- </arr>
- <lst name="dest">
- <str name="pattern">source\d(_s)</str>
- <str name="replacement">dest$1</str>
- </lst>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-selector">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <lst name="source">
- <str name="fieldRegex">source\d_.*</str>
- <lst name="exclude">
- <str name="fieldRegex">source0_.*</str>
- </lst>
- </lst>
- <str name="dest">dest_s</str>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-selector-regex">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <lst name="source">
- <str name="fieldRegex">source\d_.*</str>
- <lst name="exclude">
- <str name="fieldRegex">source0_.*</str>
- </lst>
- </lst>
- <lst name="dest">
- <str name="pattern">source\d(_s)</str>
- <str name="replacement">dest$1</str>
- </lst>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-regex-replaceall">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <lst name="source">
- <str name="fieldRegex">foo.*</str>
- </lst>
- <lst name="dest">
- <!-- unbounded pattern that can be replaced multiple times in field name -->
- <str name="pattern">x(\d)</str>
- <str name="replacement">y$1</str>
- </lst>
- </processor>
- </updateRequestProcessorChain>
-
- <updateRequestProcessorChain name="extract-regex-replaceall-with-entity-type">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <lst name="source">
- <str name="fieldRegex">foo.*</str>
- </lst>
- <lst name="dest">
- <!-- unbounded pattern that can be replaced multiple times in field name -->
- <str name="pattern">x(\d)</str>
- <str name="replacement">{EntityType}_y$1</str>
- </lst>
- </processor>
- </updateRequestProcessorChain>
-
- <!-- example used in OpenNLPExtractNamedEntitiesUpdateProcessorFactory javadocs -->
- <updateRequestProcessorChain name="multiple-extract">
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <str name="source">text</str>
- <str name="dest">people_s</str>
- </processor>
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <arr name="source">
- <str>title</str>
- <str>subtitle</str>
- </arr>
- <str name="dest">titular_people</str>
- </processor>
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <lst name="source">
- <str name="fieldRegex">.*_txt$</str>
- <lst name="exclude">
- <str name="fieldName">notes_txt</str>
- </lst>
- </lst>
- <str name="dest">people_s</str>
- </processor>
- <processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <lst name="source">
- <str name="fieldRegex">^desc(.*)s$</str>
- </lst>
- <lst name="dest">
- <str name="pattern">^desc(.*)s$</str>
- <str name="replacement">key_desc$1_people</str>
- </lst>
- </processor>
- <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
- <str name="modelFile">en-test-ner.bin</str>
- <str name="analyzerFieldType">opennlp-en-tokenization</str>
- <str name="source">summary</str>
- <str name="dest">summary_{EntityType}_s</str>
- </processor>
- </updateRequestProcessorChain>
-</config>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml b/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
deleted file mode 100644
index 23516b0..0000000
--- a/solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf/solrconfig.snippet.randomindexconfig.xml
+++ /dev/null
@@ -1,48 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<!--
-A solrconfig.xml snippet containing indexConfig settings for randomized testing.
--->
-<indexConfig>
- <!-- this sys property is not set by SolrTestCaseJ4 because we ideally want to use
- the RandomMergePolicy in all tests - but some tests expect very specific
- Merge behavior, so those tests can set it as needed.
- -->
- <mergePolicyFactory class="${solr.tests.mergePolicyFactory:org.apache.solr.util.RandomMergePolicyFactory}" />
-
- <useCompoundFile>${useCompoundFile:false}</useCompoundFile>
-
- <maxBufferedDocs>${solr.tests.maxBufferedDocs}</maxBufferedDocs>
- <ramBufferSizeMB>${solr.tests.ramBufferSizeMB}</ramBufferSizeMB>
-
- <mergeScheduler class="${solr.tests.mergeScheduler}" />
-
- <writeLockTimeout>1000</writeLockTimeout>
- <commitLockTimeout>10000</commitLockTimeout>
-
- <!-- this sys property is not set by SolrTestCaseJ4 because almost all tests should
- use the single process lockType for speed - but tests that explicitly need
- to vary the lockType can set it as needed.
- -->
- <lockType>${solr.tests.lockType:single}</lockType>
-
- <infoStream>${solr.tests.infostream:false}</infoStream>
-
-</indexConfig>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
new file mode 100644
index 0000000..b2cdbc2
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.File;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove files on Windows machines occasionally
+/**
+ * Checks that wildcard and prefix queries match documents indexed into the
+ * {@code content_icufolding}, {@code content_icunormalizer2} and
+ * {@code content_icutransform} fields. For the folding and normalizer2 fields the
+ * query strings differ from the indexed values in letter case and in composed
+ * versus decomposed accent forms.
+ */
+public class TestFoldingMultitermExtrasQuery extends SolrTestCaseJ4 {
+
+ /** @return the constant string {@code "basic"} */
+ public String getCoreName() {
+ return "basic";
+ }
+
+ /**
+ * Copies the {@code analysis-extras/solr} test files into a fresh temp directory,
+ * initializes the core from it, indexes the sample documents and commits them.
+ * The same eight strings go into the folding and normalizer2 fields; a single
+ * Cyrillic string goes into the transform field.
+ */
+ @BeforeClass
+ public static void beforeTests() throws Exception {
+ File testHome = createTempDir().toFile();
+ FileUtils.copyDirectory(getFile("analysis-extras/solr"), testHome);
+ initCore("solrconfig-icucollate.xml","schema-folding-extra.xml", testHome.getAbsolutePath());
+
+ int idx = 1;
+ // ICUFoldingFilterFactory
+ assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "BadMagicICUFolding"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Ruß"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ΜΆΪΟΣ"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "Μάϊος"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "résumé"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "re\u0301sume\u0301"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "ELİF"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icufolding", "eli\u0307f"));
+
+ // ICUNormalizer2FilterFactory
+
+ assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "BadMagicICUFolding"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Ruß"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ΜΆΪΟΣ"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "Μάϊος"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "résumé"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "re\u0301sume\u0301"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "ELİF"));
+ assertU(adoc("id", Integer.toString(idx++), "content_icunormalizer2", "eli\u0307f"));
+
+ // ICUTransformFilterFactory
+ assertU(adoc("id", Integer.toString(idx++), "content_icutransform", "Российская"));
+
+ assertU(commit());
+ }
+
+ /** Runs wildcard and prefix queries against {@code content_icufolding} and checks the hit counts. */
+ @Test
+ public void testICUFolding() {
+ assertQ(req("q", "content_icufolding:BadMagicicuFold*"), "//result[@numFound='1']");
+ assertQ(req("q", "content_icufolding:rU*"), "//result[@numFound='1']");
+ assertQ(req("q", "content_icufolding:Re*Me"), "//result[@numFound='2']");
+ assertQ(req("q", "content_icufolding:RE\u0301su*"), "//result[@numFound='2']");
+ assertQ(req("q", "content_icufolding:El*"), "//result[@numFound='2']");
+ }
+
+ /** Runs prefix queries against {@code content_icunormalizer2} and checks the hit counts. */
+ @Test
+ public void testICUNormalizer2() {
+ assertQ(req("q", "content_icunormalizer2:BadMagicicuFold*"), "//result[@numFound='1']");
+ assertQ(req("q", "content_icunormalizer2:RU*"), "//result[@numFound='1']");
+ assertQ(req("q", "content_icunormalizer2:Μάϊ*"), "//result[@numFound='2']");
+ assertQ(req("q", "content_icunormalizer2:re\u0301Su*"), "//result[@numFound='2']");
+ assertQ(req("q", "content_icunormalizer2:eL*"), "//result[@numFound='2']");
+ }
+
+ /** Runs a Cyrillic prefix query against {@code content_icutransform} and expects the single indexed document. */
+ @Test
+ public void testICUTransform() {
+ assertQ(req("q", "content_icutransform:Росс*"), "//result[@numFound='1']");
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationField.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationField.java b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationField.java
new file mode 100644
index 0000000..f164080
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationField.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import java.io.File;
+import java.io.FileOutputStream;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.util.FilesystemResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.StringMockResourceLoader;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Tests {@link ICUCollationField} with TermQueries, RangeQueries, and sort order.
+ * <p>
+ * A temporary Solr home is generated by {@link #setupSolrHome()} and twelve documents are
+ * indexed once for the whole class. Documents are added through the {@code text} field while
+ * the tests query {@code sort_*} fields (presumably populated from {@code text} by
+ * schema-icucollate.xml; the schema is not part of this file).
+ */
+public class TestICUCollationField extends SolrTestCaseJ4 {
+
+  /**
+   * Builds the Solr home, starts the core and indexes the shared test documents.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    String home = setupSolrHome();
+    initCore("solrconfig.xml","schema.xml", home);
+    // add some docs
+    assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
+    assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
+    assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
+    assertU(adoc("id", "4", "text", "Töne"));
+    assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
+    assertU(adoc("id", "6", "text", "Testing"));
+    assertU(adoc("id", "7", "text", "Tone"));
+    assertU(adoc("id", "8", "text", "Testing"));
+    assertU(adoc("id", "9", "text", "testing"));
+    assertU(adoc("id", "10", "text", "toene"));
+    assertU(adoc("id", "11", "text", "Tzne"));
+    assertU(adoc("id", "12", "text", "\u0698\u0698"));
+    assertU(commit());
+  }
+
+  /**
+   * Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
+   * These are largish files, and jvm-specific (as our documentation says, you should always
+   * look out for jvm differences with collation).
+   * So it's preferable to create this file on-the-fly.
+   * <p>
+   * Besides writing {@code customrules.dat}, this method reads the rules back through a
+   * randomly chosen {@link ResourceLoader} and asserts that the resulting collator equals
+   * the one the rules were taken from.
+   *
+   * @return absolute path of the generated Solr home directory
+   * @throws Exception if the directories or files cannot be created, or the rules are invalid
+   */
+  public static String setupSolrHome() throws Exception {
+    final String solrHome = createTempDir().toFile().getAbsolutePath();
+
+    // make data and conf dirs
+    final String coreDir = solrHome + "/collection1";
+    new File(coreDir, "data").mkdirs();
+    final File confDir = new File(coreDir, "conf");
+    confDir.mkdirs();
+
+    // copy over configuration files
+    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
+    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate.xml"), new File(confDir, "schema.xml"));
+
+    // generate custom collation rules (DIN 5007-2), saving to customrules.dat
+    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
+
+    // Each umlaut is tailored to differ only at the tertiary level from its two-letter
+    // expansion, in both lowercase and uppercase (ä/Ä ~ ae/AE, ö/Ö ~ oe/OE, ü/Ü ~ ue/UE).
+    String DIN5007_2_tailorings =
+      "& ae , a\u0308 & AE , A\u0308"+
+      "& oe , o\u0308 & OE , O\u0308"+
+      "& ue , u\u0308 & UE , U\u0308";
+
+    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+    String tailoredRules = tailoredCollator.getRules();
+    final String osFileName = "customrules.dat";
+    // try-with-resources so the stream is closed even when the write fails
+    try (FileOutputStream os = new FileOutputStream(new File(confDir, osFileName))) {
+      IOUtils.write(tailoredRules, os, "UTF-8");
+    }
+
+    // Round-trip check: load the rules either from memory or from the file just written.
+    final ResourceLoader loader;
+    if (random().nextBoolean()) {
+      loader = new StringMockResourceLoader(tailoredRules);
+    } else {
+      loader = new FilesystemResourceLoader(confDir.toPath());
+    }
+    final Collator readCollator = ICUCollationField.createFromRules(osFileName, loader);
+    assertEquals(tailoredCollator, readCollator);
+
+    return solrHome;
+  }
+
+  /**
+   * Test termquery with german DIN 5007-1 primary strength.
+   * In this case, ö is equivalent to o (but not oe).
+   * Expects docs 4 ("Töne") and 7 ("Tone"), in id order.
+   */
+  public void testBasicTermQuery() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=4]",
+        "//result/doc[2]/str[@name='id'][.=7]"
+    );
+  }
+
+  /**
+   * Test rangequery again with the DIN 5007-1 collator.
+   * We do a range query of tone .. tp, in binary order this
+   * would retrieve nothing due to case and accent differences.
+   * Expects docs 4 ("Töne") and 7 ("Tone"), in id order.
+   */
+  public void testBasicRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=4]",
+        "//result/doc[2]/str[@name='id'][.=7]"
+    );
+  }
+
+  /**
+   * Test sort with a danish collator. ö is ordered after z,
+   * so doc 11 ("Tzne") must come before doc 4 ("Töne").
+   */
+  public void testBasicSort() {
+    assertQ("Collated Sort: ",
+        req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=11]",
+        "//result/doc[2]/str[@name='id'][.=4]"
+    );
+  }
+
+  /**
+   * Test sort with an arabic collator. U+0633 is ordered after U+0698.
+   * With a binary collator, the range would also return nothing.
+   * Expects doc 12 before doc 1.
+   */
+  public void testArabicSort() {
+    assertQ("Collated Sort: ",
+        req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc" ),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=12]",
+        "//result/doc[2]/str[@name='id'][.=1]"
+    );
+  }
+
+  /**
+   * Test rangequery again with an Arabic collator.
+   * Binary order would normally order U+0633 in this range,
+   * but with collation the range must match nothing.
+   */
+  public void testNegativeRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
+        "//*[@numFound='0']"
+    );
+  }
+
+  /**
+   * Test canonical decomposition with turkish primary strength.
+   * With this sort order, İ is the uppercase form of i, and I is the uppercase form of ı.
+   * We index a decomposed form of İ (doc 5), which must match along with docs 2 and 3.
+   */
+  public void testCanonicalDecomposition() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"", "sort", "id asc" ),
+        "//*[@numFound='3']",
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=3]",
+        "//result/doc[3]/str[@name='id'][.=5]"
+    );
+  }
+
+  /**
+   * Test termquery with custom collator (DIN 5007-2).
+   * In this case, ö is equivalent to oe (but not o).
+   * Expects docs 4 ("Töne") and 10 ("toene"); no sort is requested, so their order is not checked.
+   */
+  public void testCustomCollation() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_custom:toene"),
+        "//*[@numFound='2']",
+        "//result/doc/str[@name='id'][.=4]",
+        "//result/doc/str[@name='id'][.=10]"
+    );
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldDocValues.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldDocValues.java b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldDocValues.java
new file mode 100644
index 0000000..57b403a
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldDocValues.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import java.io.File;
+import java.io.FileOutputStream;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Tests {@link ICUCollationField} with docValues.
+ * <p>
+ * A temporary Solr home using schema-icucollate-dv.xml is generated by
+ * {@link #setupSolrHome()} and twelve documents are indexed once for the whole class.
+ * Documents are added through the {@code text} field while the tests query {@code sort_*}
+ * fields (presumably populated from {@code text} by the schema, which is not part of this file).
+ */
+public class TestICUCollationFieldDocValues extends SolrTestCaseJ4 {
+
+  /**
+   * Builds the Solr home, starts the core and indexes the shared test documents.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    String home = setupSolrHome();
+    initCore("solrconfig.xml","schema.xml", home);
+    // add some docs
+    assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
+    assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
+    assertU(adoc("id", "3", "text", "ı will use turkish casıng"));
+    assertU(adoc("id", "4", "text", "Töne"));
+    assertU(adoc("id", "5", "text", "I W\u0049\u0307LL USE TURKİSH CASING"));
+    assertU(adoc("id", "6", "text", "Testing"));
+    assertU(adoc("id", "7", "text", "Tone"));
+    assertU(adoc("id", "8", "text", "Testing"));
+    assertU(adoc("id", "9", "text", "testing"));
+    assertU(adoc("id", "10", "text", "toene"));
+    assertU(adoc("id", "11", "text", "Tzne"));
+    assertU(adoc("id", "12", "text", "\u0698\u0698"));
+    assertU(commit());
+  }
+
+  /**
+   * Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
+   * These are largish files, and jvm-specific (as our documentation says, you should always
+   * look out for jvm differences with collation).
+   * So it's preferable to create this file on-the-fly.
+   *
+   * @return absolute path of the generated Solr home directory
+   * @throws Exception if the directories or files cannot be created, or the rules are invalid
+   */
+  public static String setupSolrHome() throws Exception {
+    final File solrHome = createTempDir().toFile();
+
+    // make data and conf dirs
+    final File coreDir = new File(solrHome, "collection1");
+    new File(coreDir, "data").mkdirs();
+    final File confDir = new File(coreDir, "conf");
+    confDir.mkdirs();
+
+    // copy over configuration files
+    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
+    FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml"), new File(confDir, "schema.xml"));
+
+    // generate custom collation rules (DIN 5007-2), saving to customrules.dat
+    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
+
+    // Each umlaut is tailored to differ only at the tertiary level from its two-letter
+    // expansion, in both lowercase and uppercase (ä/Ä ~ ae/AE, ö/Ö ~ oe/OE, ü/Ü ~ ue/UE).
+    String DIN5007_2_tailorings =
+      "& ae , a\u0308 & AE , A\u0308"+
+      "& oe , o\u0308 & OE , O\u0308"+
+      "& ue , u\u0308 & UE , U\u0308";
+
+    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+    String tailoredRules = tailoredCollator.getRules();
+    // try-with-resources so the stream is closed even when the write fails
+    try (FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"))) {
+      IOUtils.write(tailoredRules, os, "UTF-8");
+    }
+
+    return solrHome.getAbsolutePath();
+  }
+
+  /**
+   * Test termquery with german DIN 5007-1 primary strength.
+   * In this case, ö is equivalent to o (but not oe).
+   * Expects docs 4 ("Töne") and 7 ("Tone"), in id order.
+   */
+  public void testBasicTermQuery() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=4]",
+        "//result/doc[2]/str[@name='id'][.=7]"
+    );
+  }
+
+  /**
+   * Test rangequery again with the DIN 5007-1 collator.
+   * We do a range query of tone .. tp, in binary order this
+   * would retrieve nothing due to case and accent differences.
+   * Expects docs 4 ("Töne") and 7 ("Tone"), in id order.
+   */
+  public void testBasicRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=4]",
+        "//result/doc[2]/str[@name='id'][.=7]"
+    );
+  }
+
+  /**
+   * Test sort with a danish collator. ö is ordered after z,
+   * so doc 11 ("Tzne") must come before doc 4 ("Töne").
+   */
+  public void testBasicSort() {
+    assertQ("Collated Sort: ",
+        req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=11]",
+        "//result/doc[2]/str[@name='id'][.=4]"
+    );
+  }
+
+  /**
+   * Test sort with an arabic collator. U+0633 is ordered after U+0698.
+   * With a binary collator, the range would also return nothing.
+   * Expects doc 12 before doc 1.
+   */
+  public void testArabicSort() {
+    assertQ("Collated Sort: ",
+        req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort", "sort_ar asc" ),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=12]",
+        "//result/doc[2]/str[@name='id'][.=1]"
+    );
+  }
+
+  /**
+   * Test rangequery again with an Arabic collator.
+   * Binary order would normally order U+0633 in this range,
+   * but with collation the range must match nothing.
+   */
+  public void testNegativeRangeQuery() {
+    assertQ("Collated RangeQ: ",
+        req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
+        "//*[@numFound='0']"
+    );
+  }
+
+  /**
+   * Test canonical decomposition with turkish primary strength.
+   * With this sort order, İ is the uppercase form of i, and I is the uppercase form of ı.
+   * We index a decomposed form of İ (doc 5), which must match along with docs 2 and 3.
+   */
+  public void testCanonicalDecomposition() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"", "sort", "id asc" ),
+        "//*[@numFound='3']",
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=3]",
+        "//result/doc[3]/str[@name='id'][.=5]"
+    );
+  }
+
+  /**
+   * Test termquery with custom collator (DIN 5007-2).
+   * In this case, ö is equivalent to oe (but not o).
+   * Expects docs 4 ("Töne") and 10 ("toene"); no sort is requested, so their order is not checked.
+   */
+  public void testCustomCollation() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_custom:toene"),
+        "//*[@numFound='2']",
+        "//result/doc/str[@name='id'][.=4]",
+        "//result/doc/str[@name='id'][.=10]"
+    );
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldOptions.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldOptions.java b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldOptions.java
new file mode 100644
index 0000000..0b198b7
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/test/java/org/apache/solr/schema/TestICUCollationFieldOptions.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+
+import java.io.File;
+
+/**
+ * Tests expert options of {@link ICUCollationField}.
+ * <p>
+ * Nine documents are indexed once for the whole class; each test queries or sorts on a
+ * differently configured {@code sort_*} field.
+ */
+public class TestICUCollationFieldOptions extends SolrTestCaseJ4 {
+
+  /**
+   * Copies the analysis-extras Solr home to a temp directory, starts the core with the
+   * collation-options schema and indexes the shared test documents with ids 1 through 9.
+   */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    final File solrHome = createTempDir().toFile();
+    FileUtils.copyDirectory(getFile("analysis-extras/solr"), solrHome);
+    initCore("solrconfig-icucollate.xml","schema-icucollateoptions.xml", solrHome.getAbsolutePath());
+
+    // add some docs; the id of each document is its 1-based position in this array
+    final String[] texts = {
+      "foo-bar",
+      "foo bar",
+      "foobar",
+      "foobar-10",
+      "foobar-9",
+      "resume",
+      "Résumé",
+      "Resume",
+      "résumé"
+    };
+    for (int i = 0; i < texts.length; i++) {
+      assertU(adoc("id", Integer.toString(i + 1), "text", texts[i]));
+    }
+    assertU(commit());
+  }
+
+  /**
+   * Setting alternate=shifted to shift whitespace, punctuation and symbols
+   * to quaternary level.
+   * Querying "foobar" must match "foo-bar", "foo bar" and "foobar" (docs 1, 2, 3).
+   */
+  public void testIgnorePunctuation() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_punctuation:foobar", "sort", "id asc"),
+        "//*[@numFound='3']",
+        "//result/doc[1]/str[@name='id'][.=1]",
+        "//result/doc[2]/str[@name='id'][.=2]",
+        "//result/doc[3]/str[@name='id'][.=3]");
+  }
+
+  /**
+   * Setting alternate=shifted and variableTop to shift whitespace, but not
+   * punctuation or symbols, to quaternary level.
+   * Querying "foo bar" must match "foo bar" and "foobar" (docs 2, 3) but not "foo-bar".
+   */
+  public void testIgnoreWhitespace() {
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_space:\"foo bar\"", "sort", "id asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=2]",
+        "//result/doc[2]/str[@name='id'][.=3]");
+  }
+
+  /**
+   * Setting numeric to encode digits with numeric value, so that
+   * foobar-9 (doc 5) sorts before foobar-10 (doc 4).
+   */
+  public void testNumerics() {
+    assertQ("Collated sort: ",
+        req("fl", "id", "q", "id:[4 TO 5]", "sort", "sort_numerics asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=5]",
+        "//result/doc[2]/str[@name='id'][.=4]");
+  }
+
+  /**
+   * Setting caseLevel=true to create an additional case level between
+   * secondary and tertiary.
+   * Lowercase "resume" must match docs 6 and 9; capitalized "Resume" must match docs 7 and 8.
+   */
+  public void testIgnoreAccentsButNotCase() {
+    // lowercase query: "resume" and "résumé"
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_accents:resume", "sort", "id asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=6]",
+        "//result/doc[2]/str[@name='id'][.=9]");
+
+    // capitalized query: "Résumé" and "Resume"
+    assertQ("Collated TQ: ",
+        req("fl", "id", "q", "sort_ignore_accents:Resume", "sort", "id asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=7]",
+        "//result/doc[2]/str[@name='id'][.=8]");
+  }
+
+  /**
+   * Setting caseFirst=upper to cause uppercase strings to sort
+   * before lowercase ones: "Resume" (doc 8) must come before "resume" (doc 6).
+   */
+  public void testUpperCaseFirst() {
+    assertQ("Collated sort: ",
+        req("fl", "id", "q", "id:6 OR id:8", "sort", "sort_uppercase_first asc"),
+        "//*[@numFound='2']",
+        "//result/doc[1]/str[@name='id'][.=8]",
+        "//result/doc[2]/str[@name='id'][.=6]");
+  }
+}