You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/11/02 10:30:05 UTC

[7/7] lucene-solr:jira/gradle: Adding solr:analysis-extras module

Adding solr:analysis-extras module


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6c070b4a
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6c070b4a
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6c070b4a

Branch: refs/heads/jira/gradle
Commit: 6c070b4a553f8c2da8f2481b1977f1ec239ae610
Parents: 5e447a4
Author: Cao Manh Dat <da...@apache.org>
Authored: Fri Nov 2 10:29:44 2018 +0000
Committer: Cao Manh Dat <da...@apache.org>
Committed: Fri Nov 2 10:29:44 2018 +0000

----------------------------------------------------------------------
 lucene/analysis/common/build.gradle             |  13 +
 settings.gradle                                 |   3 +-
 solr/contrib/analysis-extras/build.gradle       |  17 +
 .../apache/solr/schema/ICUCollationField.java   | 311 ----------
 .../java/org/apache/solr/schema/package.html    |  23 -
 ...ractNamedEntitiesUpdateProcessorFactory.java | 577 -------------------
 .../apache/solr/update/processor/package.html   |  24 -
 .../analysis-extras/src/java/overview.html      |  21 -
 .../apache/solr/schema/ICUCollationField.java   | 311 ++++++++++
 .../java/org/apache/solr/schema/package.html    |  23 +
 ...ractNamedEntitiesUpdateProcessorFactory.java | 577 +++++++++++++++++++
 .../apache/solr/update/processor/package.html   |  24 +
 .../analysis-extras/src/main/java/overview.html |  21 +
 .../solr/collection1/conf/en-test-ner.bin       | Bin 2049 -> 0 bytes
 .../solr/collection1/conf/en-test-sent.bin      | Bin 1051 -> 0 bytes
 .../solr/collection1/conf/en-test-tokenizer.bin | Bin 15100 -> 0 bytes
 .../collection1/conf/schema-folding-extra.xml   |  52 --
 .../collection1/conf/schema-icucollate-dv.xml   |  57 --
 .../solr/collection1/conf/schema-icucollate.xml |  57 --
 .../conf/schema-icucollateoptions.xml           |  68 ---
 .../collection1/conf/schema-opennlp-extract.xml |  49 --
 .../collection1/conf/solrconfig-icucollate.xml  |  27 -
 .../conf/solrconfig-opennlp-extract.xml         | 206 -------
 .../solrconfig.snippet.randomindexconfig.xml    |  48 --
 .../TestFoldingMultitermExtrasQuery.java        |  87 +++
 .../solr/schema/TestICUCollationField.java      | 192 ++++++
 .../schema/TestICUCollationFieldDocValues.java  | 180 ++++++
 .../schema/TestICUCollationFieldOptions.java    | 119 ++++
 ...ractNamedEntitiesUpdateProcessorFactory.java | 195 +++++++
 .../TestFoldingMultitermExtrasQuery.java        |  87 ---
 .../solr/schema/TestICUCollationField.java      | 192 ------
 .../schema/TestICUCollationFieldDocValues.java  | 180 ------
 .../schema/TestICUCollationFieldOptions.java    | 119 ----
 ...ractNamedEntitiesUpdateProcessorFactory.java | 195 -------
 .../solr/collection1/conf/en-test-ner.bin       | Bin 0 -> 2049 bytes
 .../solr/collection1/conf/en-test-sent.bin      | Bin 0 -> 1051 bytes
 .../solr/collection1/conf/en-test-tokenizer.bin | Bin 0 -> 15100 bytes
 .../collection1/conf/schema-folding-extra.xml   |  52 ++
 .../collection1/conf/schema-icucollate-dv.xml   |  57 ++
 .../solr/collection1/conf/schema-icucollate.xml |  57 ++
 .../conf/schema-icucollateoptions.xml           |  68 +++
 .../collection1/conf/schema-opennlp-extract.xml |  49 ++
 .../collection1/conf/solrconfig-icucollate.xml  |  27 +
 .../conf/solrconfig-opennlp-extract.xml         | 206 +++++++
 .../solrconfig.snippet.randomindexconfig.xml    |  48 ++
 solr/core/build.gradle                          |   1 +
 46 files changed, 2326 insertions(+), 2294 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/lucene/analysis/common/build.gradle
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/build.gradle b/lucene/analysis/common/build.gradle
index 73321f7..841f27c 100644
--- a/lucene/analysis/common/build.gradle
+++ b/lucene/analysis/common/build.gradle
@@ -1,5 +1,18 @@
 apply plugin: 'java'
 
+task jarTest (type: Jar) {
+    from sourceSets.test.output
+    classifier = 'test'
+}
+
+configurations {
+    testOutput
+}
+
+artifacts {
+    testOutput jarTest
+}
+
 dependencies {
 	compile project(':lucene:core')
     testCompile project(':lucene:codecs')

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/settings.gradle
----------------------------------------------------------------------
diff --git a/settings.gradle b/settings.gradle
index 56554a7..5884dcb 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -45,4 +45,5 @@ include 'solr:core'
 include 'solr:server'
 include 'solr:solrj'
 include 'solr:test-framework'
-include 'solr:example:example-DIH'
\ No newline at end of file
+include 'solr:example:example-DIH'
+include 'solr:contrib:analysis-extras'
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/build.gradle
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/build.gradle b/solr/contrib/analysis-extras/build.gradle
new file mode 100644
index 0000000..6eba4ad
--- /dev/null
+++ b/solr/contrib/analysis-extras/build.gradle
@@ -0,0 +1,17 @@
+apply plugin: 'java'
+
+dependencies {
+    compile library.icu4j
+    compile library.opennlp_tools
+    compile library.morfologik_fsa
+    compile library.morfologik_polish
+    compile library.morfologik_stemming
+
+	compile project(':solr:solrj')
+	compile project(':solr:core')
+	compile project(':lucene:analysis:icu')
+	compile project(':lucene:analysis:opennlp')
+
+	testCompile project(':solr:test-framework')
+	testCompile project(path: ':lucene:analysis:common', configuration: 'testOutput')
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
deleted file mode 100644
index f723a25..0000000
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.schema;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
-import org.apache.lucene.document.SortedDocValuesField;
-import org.apache.lucene.document.SortedSetDocValuesField;
-import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermRangeQuery;
-import org.apache.lucene.util.BytesRef;
-import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.response.TextResponseWriter;
-import org.apache.solr.search.QParser;
-import org.apache.solr.uninverting.UninvertingReader.Type;
-
-import com.ibm.icu.text.Collator;
-import com.ibm.icu.text.RuleBasedCollator;
-import com.ibm.icu.util.ULocale;
-
-/**
- * Field for collated sort keys. 
- * These can be used for locale-sensitive sort and range queries.
- * <p>
- * This field can be created in two ways: 
- * <ul>
- *  <li>Based upon a system collator associated with a Locale.
- *  <li>Based upon a tailored ruleset.
- * </ul>
- * <p>
- * Using a System collator:
- * <ul>
- *  <li>locale: RFC 3066 locale ID (mandatory)
- *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
- *  <li>decomposition: 'no', or 'canonical' (optional)
- * </ul>
- * <p>
- * Using a Tailored ruleset:
- * <ul>
- *  <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
- *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
- *  <li>decomposition: 'no' or 'canonical' (optional)
- * </ul>
- * <p>
- * Expert options:
- * <ul>
- *  <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
- *  <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
- *  <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
- *  <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
- *  <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
- * </ul>
- * 
- * @see Collator
- * @see ULocale
- * @see RuleBasedCollator
- */
-public class ICUCollationField extends FieldType {
-  private Analyzer analyzer;
-
-  @Override
-  protected void init(IndexSchema schema, Map<String,String> args) {
-    properties |= TOKENIZED; // this ensures our analyzer gets hit
-    setup(schema.getResourceLoader(), args);
-    super.init(schema, args);
-  }
-  
-  /**
-   * Setup the field according to the provided parameters
-   */
-  private void setup(ResourceLoader loader, Map<String,String> args) {
-    String custom = args.remove("custom");
-    String localeID = args.remove("locale");
-    String strength = args.remove("strength");
-    String decomposition = args.remove("decomposition");
-    
-    String alternate = args.remove("alternate");
-    String caseLevel = args.remove("caseLevel");
-    String caseFirst = args.remove("caseFirst");
-    String numeric = args.remove("numeric");
-    String variableTop = args.remove("variableTop");
-
-    if (custom == null && localeID == null)
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
-    
-    if (custom != null && localeID != null)
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
-          + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
-          + "Then save the entire customized ruleset to a file, and use with the custom parameter");
-    
-    final Collator collator;
-    
-    if (localeID != null) { 
-      // create from a system collator, based on Locale.
-      collator = createFromLocale(localeID);
-    } else { 
-      // create from a custom ruleset
-      collator = createFromRules(custom, loader);
-    }
-    
-    // set the strength flag, otherwise it will be the default.
-    if (strength != null) {
-      if (strength.equalsIgnoreCase("primary"))
-        collator.setStrength(Collator.PRIMARY);
-      else if (strength.equalsIgnoreCase("secondary"))
-        collator.setStrength(Collator.SECONDARY);
-      else if (strength.equalsIgnoreCase("tertiary"))
-        collator.setStrength(Collator.TERTIARY);
-      else if (strength.equalsIgnoreCase("quaternary"))
-        collator.setStrength(Collator.QUATERNARY);
-      else if (strength.equalsIgnoreCase("identical"))
-        collator.setStrength(Collator.IDENTICAL);
-      else
-        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
-    }
-    
-    // set the decomposition flag, otherwise it will be the default.
-    if (decomposition != null) {
-      if (decomposition.equalsIgnoreCase("no"))
-        collator.setDecomposition(Collator.NO_DECOMPOSITION);
-      else if (decomposition.equalsIgnoreCase("canonical"))
-        collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
-      else
-        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
-    }
-    
-    // expert options: concrete subclasses are always a RuleBasedCollator
-    RuleBasedCollator rbc = (RuleBasedCollator) collator;
-    if (alternate != null) {
-      if (alternate.equalsIgnoreCase("shifted")) {
-        rbc.setAlternateHandlingShifted(true);
-      } else if (alternate.equalsIgnoreCase("non-ignorable")) {
-        rbc.setAlternateHandlingShifted(false);
-      } else {
-        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
-      }
-    }
-    if (caseLevel != null) {
-      rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
-    }
-    if (caseFirst != null) {
-      if (caseFirst.equalsIgnoreCase("lower")) {
-        rbc.setLowerCaseFirst(true);
-      } else if (caseFirst.equalsIgnoreCase("upper")) {
-        rbc.setUpperCaseFirst(true);
-      } else {
-        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
-      }
-    }
-    if (numeric != null) {
-      rbc.setNumericCollation(Boolean.parseBoolean(numeric));
-    }
-    if (variableTop != null) {
-      rbc.setVariableTop(variableTop);
-    }
-
-    analyzer = new ICUCollationKeyAnalyzer(collator);
-  }
-  
-  /**
-   * Create a locale from localeID.
-   * Then return the appropriate collator for the locale.
-   */
-  private Collator createFromLocale(String localeID) {
-    return Collator.getInstance(new ULocale(localeID));
-  }
-  
-  /**
-   * Read custom rules from a file, and create a RuleBasedCollator
-   * The file cannot support comments, as # might be in the rules!
-   */
-  static Collator createFromRules(String fileName, ResourceLoader loader) {
-    InputStream input = null;
-    try {
-     input = loader.openResource(fileName);
-     String rules = IOUtils.toString(input, "UTF-8");
-     return new RuleBasedCollator(rules);
-    } catch (Exception e) {
-      // io error or invalid rules
-      throw new RuntimeException(e);
-    } finally {
-      IOUtils.closeQuietly(input);
-    }
-  }
-
-  @Override
-  public void write(TextResponseWriter writer, String name, IndexableField f) throws IOException {
-    writer.writeStr(name, f.stringValue(), true);
-  }
-
-  @Override
-  public SortField getSortField(SchemaField field, boolean top) {
-    return getStringSort(field, top);
-  }
-  
-  @Override
-  public Type getUninversionType(SchemaField sf) {
-    if (sf.multiValued()) {
-      return Type.SORTED_SET_BINARY; 
-    } else {
-      return Type.SORTED;
-    }
-  }
-
-  @Override
-  public Analyzer getIndexAnalyzer() {
-    return analyzer;
-  }
-
-  @Override
-  public Analyzer getQueryAnalyzer() {
-    return analyzer;
-  }
-
-  /**
-   * analyze the text with the analyzer, instead of the collator.
-   * because icu collators are not thread safe, this keeps things 
-   * simple (we already have a threadlocal clone in the reused TS)
-   */
-  private BytesRef getCollationKey(String field, String text) {
-    try (TokenStream source = analyzer.tokenStream(field, text)) {
-      source.reset();
-      
-      TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
-      
-
-      // we control the analyzer here: most errors are impossible
-      if (!source.incrementToken())
-        throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
-      BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
-      assert !source.incrementToken();
-      
-      source.end();
-      return bytes;
-    } catch (IOException e) {
-      throw new RuntimeException("Unable to analyze text: " + text, e);
-    }
-  }
-  
-  @Override
-  public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
-    String f = field.getName();
-    BytesRef low = part1 == null ? null : getCollationKey(f, part1);
-    BytesRef high = part2 == null ? null : getCollationKey(f, part2);
-    if (!field.indexed() && field.hasDocValues()) {
-      return SortedSetDocValuesField.newSlowRangeQuery(
-          field.getName(), low, high, minInclusive, maxInclusive);
-    } else {
-      return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
-    }
-  }
-
-  @Override
-  protected void checkSupportsDocValues() { // we support DocValues
-  }
-
-  @Override
-  public List<IndexableField> createFields(SchemaField field, Object value) {
-    if (field.hasDocValues()) {
-      List<IndexableField> fields = new ArrayList<>();
-      fields.add(createField(field, value));
-      final BytesRef bytes = getCollationKey(field.getName(), value.toString());
-      if (field.multiValued()) {
-        fields.add(new SortedSetDocValuesField(field.getName(), bytes));
-      } else {
-        fields.add(new SortedDocValuesField(field.getName(), bytes));
-      }
-      return fields;
-    } else {
-      return Collections.singletonList(createField(field, value));
-    }
-  }
-
-  @Override
-  public Object marshalSortValue(Object value) {
-    return marshalBase64SortValue(value);
-  }
-
-  @Override
-  public Object unmarshalSortValue(Object value) {
-    return unmarshalBase64SortValue(value);
-  }
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/package.html b/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/package.html
deleted file mode 100644
index 27d68db..0000000
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/package.html
+++ /dev/null
@@ -1,23 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!-- not a package-info.java, because we already defined this package in core/ -->
-<html>
-<body>
-<code>FieldType</code> plugins that have additional dependencies.
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
deleted file mode 100644
index d69c367..0000000
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
+++ /dev/null
@@ -1,577 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.update.processor;
-
-import java.io.IOException;
-import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.regex.PatternSyntaxException;
-
-import opennlp.tools.util.Span;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.opennlp.OpenNLPTokenizer;
-import org.apache.lucene.analysis.opennlp.tools.NLPNERTaggerOp;
-import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrInputField;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.common.util.Pair;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.response.SolrQueryResponse;
-import org.apache.solr.schema.FieldType;
-import org.apache.solr.update.AddUpdateCommand;
-import org.apache.solr.update.processor.FieldMutatingUpdateProcessor.FieldNameSelector;
-import org.apache.solr.update.processor.FieldMutatingUpdateProcessorFactory.SelectorParams;
-import org.apache.solr.util.plugin.SolrCoreAware;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
-
-/**
- * Extracts named entities using an OpenNLP NER <code>modelFile</code> from the values found in
- * any matching <code>source</code> field into a configured <code>dest</code> field, after
- * first tokenizing the source text using the index analyzer on the configured
- * <code>analyzerFieldType</code>, which must include <code>solr.OpenNLPTokenizerFactory</code>
- * as the tokenizer. E.g.:
- *
- * <pre class="prettyprint">
- *   &lt;fieldType name="opennlp-en-tokenization" class="solr.TextField"&gt;
- *     &lt;analyzer&gt;
- *       &lt;tokenizer class="solr.OpenNLPTokenizerFactory"
- *                  sentenceModel="en-sent.bin"
- *                  tokenizerModel="en-tokenizer.bin"/&gt;
- *     &lt;/analyzer&gt;
- *   &lt;/fieldType&gt;
- * </pre>
- * 
- * <p>See the <a href="http://opennlp.apache.org/models.html">OpenNLP website</a>
- * for information on downloading pre-trained models.</p>
- *
- * Note that in order to use model files larger than 1MB on SolrCloud, 
- * <a href="https://lucene.apache.org/solr/guide/setting-up-an-external-zookeeper-ensemble#increasing-zookeeper-s-1mb-file-size-limit"
- * >ZooKeeper server and client configuration is required</a>.
- * 
- * <p>
- * The <code>source</code> field(s) can be configured as either:
- * </p>
- * <ul>
- *  <li>One or more <code>&lt;str&gt;</code></li>
- *  <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code></li>
- *  <li>A <code>&lt;lst&gt;</code> containing
- *   {@link FieldMutatingUpdateProcessor FieldMutatingUpdateProcessorFactory style selector arguments}</li>
- * </ul>
- *
- * <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code>
- * containing the literal name of a destination field, or it may be a <code>&lt;lst&gt;</code> specifying a
- * regex <code>pattern</code> and a <code>replacement</code> string. If the pattern + replacement option
- * is used the pattern will be matched against all fields matched by the source selector, and the replacement
- * string (including any capture groups specified from the pattern) will be evaluated using
- * {@link Matcher#replaceAll(String)} to generate the literal name of the destination field.  Additionally,
- * an occurrence of the string "{EntityType}" in the <code>dest</code> field specification, or in the
- * <code>replacement</code> string, will be replaced with the entity type(s) returned for each entity by
- * the OpenNLP NER model; as a result, if the model extracts more than one entity type, then more than one
- * <code>dest</code> field will be populated.
- * </p>
- *
- * <p>If the resolved <code>dest</code> field already exists in the document, then the
- * named entities extracted from the <code>source</code> fields will be added to it.
- * </p>
- * <p>
- * In the example below:
- * </p>
- * <ul>
- *   <li>Named entities will be extracted from the <code>text</code> field and added
- *       to the <code>people_s</code> field</li>
- *   <li>Named entities will be extracted from both the <code>title</code> and
- *       <code>subtitle</code> fields and added into the <code>titular_people</code> field</li>
- *   <li>Named entities will be extracted from any field with a name ending in <code>_txt</code>
- *       -- except for <code>notes_txt</code> -- and added into the <code>people_s</code> field</li>
- *   <li>Named entities will be extracted from any field with a name beginning with "desc" and
- *       ending in "s" (e.g. "descs" and "descriptions") and added to a field prefixed with "key_",
- *       not ending in "s", and suffixed with "_people". (e.g. "key_desc_people" or
- *       "key_description_people")</li>
- *   <li>Named entities will be extracted from the <code>summary</code> field and added
- *       to the <code>summary_person_s</code> field, assuming that the modelFile only extracts
- *       entities of type "person".</li>
- * </ul>
- *
- * <pre class="prettyprint">
- * &lt;updateRequestProcessorChain name="multiple-extract"&gt;
- *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- *     &lt;str name="source"&gt;text&lt;/str&gt;
- *     &lt;str name="dest"&gt;people_s&lt;/str&gt;
- *   &lt;/processor&gt;
- *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- *     &lt;arr name="source"&gt;
- *       &lt;str&gt;title&lt;/str&gt;
- *       &lt;str&gt;subtitle&lt;/str&gt;
- *     &lt;/arr&gt;
- *     &lt;str name="dest"&gt;titular_people&lt;/str&gt;
- *   &lt;/processor&gt;
- *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- *     &lt;lst name="source"&gt;
- *       &lt;str name="fieldRegex"&gt;.*_txt$&lt;/str&gt;
- *       &lt;lst name="exclude"&gt;
- *         &lt;str name="fieldName"&gt;notes_txt&lt;/str&gt;
- *       &lt;/lst&gt;
- *     &lt;/lst&gt;
- *     &lt;str name="dest"&gt;people_s&lt;/str&gt;
- *   &lt;/processor&gt;
- *   &lt;processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- *     &lt;lst name="source"&gt;
- *       &lt;str name="fieldRegex"&gt;^desc(.*)s$&lt;/str&gt;
- *     &lt;/lst&gt;
- *     &lt;lst name="dest"&gt;
- *       &lt;str name="pattern"&gt;^desc(.*)s$&lt;/str&gt;
- *       &lt;str name="replacement"&gt;key_desc$1_people&lt;/str&gt;
- *     &lt;/lst&gt;
- *   &lt;/processor&gt;
- *   &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- *     &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- *     &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- *     &lt;str name="source"&gt;summary&lt;/str&gt;
- *     &lt;str name="dest"&gt;summary_{EntityType}_s&lt;/str&gt;
- *   &lt;/processor&gt;
- *   &lt;processor class="solr.LogUpdateProcessorFactory" /&gt;
- *   &lt;processor class="solr.RunUpdateProcessorFactory" /&gt;
- * &lt;/updateRequestProcessorChain&gt;
- * </pre>
- *
- * @since 7.3.0
- */
-public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory
-    extends UpdateRequestProcessorFactory implements SolrCoreAware {
-
-  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
-  public static final String SOURCE_PARAM = "source";
-  public static final String DEST_PARAM = "dest";
-  public static final String PATTERN_PARAM = "pattern";
-  public static final String REPLACEMENT_PARAM = "replacement";
-  public static final String MODEL_PARAM = "modelFile";
-  public static final String ANALYZER_FIELD_TYPE_PARAM = "analyzerFieldType";
-  public static final String ENTITY_TYPE = "{EntityType}";
-
-  private SelectorParams srcInclusions = new SelectorParams();
-  private Collection<SelectorParams> srcExclusions = new ArrayList<>();
-
-  private FieldNameSelector srcSelector = null;
-
-  private String modelFile = null;
-  private String analyzerFieldType = null;
-
-  /**
-   * If pattern is null, this is a literal field name.  If pattern is non-null then this
-   * is a replacement string that may contain meta-characters (ie: capture group identifiers)
-   * @see #pattern
-   */
-  private String dest = null;
-  /** @see #dest */
-  private Pattern pattern = null;
-
-  protected final FieldNameSelector getSourceSelector() {
-    if (null != srcSelector) return srcSelector;
-
-    throw new SolrException(SERVER_ERROR, "selector was never initialized, inform(SolrCore) never called???");
-  }
-
-  /**
-   * Parses and validates the factory's init params.
-   * <p>
-   * Exactly one of two syntaxes must be used: the full syntax (source + dest) or the short hand
-   * syntax (pattern + replacement).  The model file and analyzer field type params are mandatory
-   * in both cases and must be plain strings.
-   *
-   * @param args the init params; every recognized entry is removed from it while parsing
-   * @throws SolrException if a mandatory param is missing or has the wrong type, if neither
-   *         syntax is present, or if any unrecognized param remains after parsing
-   */
-  @SuppressWarnings("unchecked")
-  @Override
-  public void init(NamedList args) {
-
-    // high level (loose) check for which type of config we have.
-    //
-    // individual init methods do more strict syntax checking
-    final boolean fullSyntax =
-        0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM, 0);
-    final boolean shortHandSyntax =
-        0 <= args.indexOf(PATTERN_PARAM, 0) && 0 <= args.indexOf(REPLACEMENT_PARAM, 0);
-    if ( ! fullSyntax && ! shortHandSyntax) {
-      throw new SolrException(SERVER_ERROR, "A combination of either '" + SOURCE_PARAM + "' + '"+
-          DEST_PARAM + "', or '" + REPLACEMENT_PARAM + "' + '" +
-          PATTERN_PARAM + "' init params are mandatory");
-    }
-
-    // Consume the two mandatory top-level params BEFORE delegating to the syntax helpers:
-    // initSimpleRegexReplacement() rejects any arg left over after it has removed pattern and
-    // replacement, so leaving these in place made the short hand syntax impossible to use.
-    Object modelParam = args.remove(MODEL_PARAM);
-    if (null == modelParam) {
-      throw new SolrException(SERVER_ERROR, "Missing required init param '" + MODEL_PARAM + "'");
-    }
-    if ( ! (modelParam instanceof CharSequence)) {
-      throw new SolrException(SERVER_ERROR, "Init param '" + MODEL_PARAM + "' must be a <str>");
-    }
-    modelFile = modelParam.toString();
-
-    Object analyzerFieldTypeParam = args.remove(ANALYZER_FIELD_TYPE_PARAM);
-    if (null == analyzerFieldTypeParam) {
-      throw new SolrException(SERVER_ERROR, "Missing required init param '" + ANALYZER_FIELD_TYPE_PARAM + "'");
-    }
-    if ( ! (analyzerFieldTypeParam instanceof CharSequence)) {
-      throw new SolrException(SERVER_ERROR, "Init param '" + ANALYZER_FIELD_TYPE_PARAM + "' must be a <str>");
-    }
-    analyzerFieldType = analyzerFieldTypeParam.toString();
-
-    // The full syntax wins when both are present; its helper then reports the mix-up.
-    if (fullSyntax) {
-      initSourceSelectorSyntax(args);
-    } else {
-      initSimpleRegexReplacement(args);
-    }
-
-    if (0 < args.size()) {
-      throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'");
-    }
-
-    super.init(args);
-  }
-
-  /**
-   * init helper method that should only be called when we know for certain that both the
-   * "source" and "dest" init params do <em>not</em> exist.
-   */
-  @SuppressWarnings("unchecked")
-  private void initSimpleRegexReplacement(NamedList args) {
-    // The syntactic sugar for the case where there is only one regex pattern for source and the same pattern
-    // is used for the destination pattern...
-    //
-    //  pattern != null && replacement != null
-    //
-    // ...as top level elements, with no other config options specified
-
-    // if we got here we know we had pattern and replacement, now check for the other two  so that we can give a better
-    // message than "unexpected"
-    if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0) ) {
-      throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
-          PATTERN_PARAM + " and " + REPLACEMENT_PARAM + " but also found " + SOURCE_PARAM + " or " + DEST_PARAM);
-    }
-
-    assert args.indexOf(SOURCE_PARAM, 0) < 0;
-
-    Object patt = args.remove(PATTERN_PARAM);
-    Object replacement = args.remove(REPLACEMENT_PARAM);
-
-    if (null == patt || null == replacement) {
-      throw new SolrException(SERVER_ERROR, "Init params '" + PATTERN_PARAM + "' and '" +
-          REPLACEMENT_PARAM + "' are both mandatory if '" + SOURCE_PARAM + "' and '"+
-          DEST_PARAM + "' are not both specified");
-    }
-
-    if (0 != args.size()) {
-      throw new SolrException(SERVER_ERROR, "Init params '" + REPLACEMENT_PARAM + "' and '" +
-          PATTERN_PARAM + "' must be children of '" + DEST_PARAM +
-          "' to be combined with other options.");
-    }
-
-    if (!(replacement instanceof String)) {
-      throw new SolrException(SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a string (i.e. <str>)");
-    }
-    if (!(patt instanceof String)) {
-      throw new SolrException(SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string (i.e. <str>)");
-    }
-
-    dest = replacement.toString();
-    try {
-      this.pattern = Pattern.compile(patt.toString());
-    } catch (PatternSyntaxException pe) {
-      throw new SolrException(SERVER_ERROR, "Init param " + PATTERN_PARAM +
-          " is not a valid regex pattern: " + patt, pe);
-
-    }
-    srcInclusions = new SelectorParams();
-    srcInclusions.fieldRegex = Collections.singletonList(this.pattern);
-  }
-
-  /**
-   * init helper method that should only be called when we know for certain that both the
-   * "source" and "dest" init params <em>do</em> exist.
-   * <p>
-   * "source" may be one or more field name strings, or a single nested selector list that may
-   * carry "exclude" children.  "dest" may be a literal field name, or a list holding a pattern
-   * and a replacement.
-   *
-   * @param args the init params; the "source" and "dest" entries are removed from it
-   * @throws SolrException if the short hand params are also present, or if "source" or "dest"
-   *         are malformed
-   */
-  @SuppressWarnings("unchecked")
-  private void initSourceSelectorSyntax(NamedList args) {
-    // Full and complete syntax where source and dest are mandatory.
-    //
-    // source may be a single string or a selector.
-    // dest may be a single string or list containing pattern and replacement
-    //
-    //   source != null && dest != null
-
-    // if we got here we know we had source and dest, now check for the other two so that we can give a better
-    // message than "unexpected"
-    if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0) ) {
-      throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
-          SOURCE_PARAM + " and " + DEST_PARAM + " but also found " + PATTERN_PARAM + " or " + REPLACEMENT_PARAM);
-    }
-
-    Object d = args.remove(DEST_PARAM);
-    assert null != d;
-
-    List<Object> sources = args.getAll(SOURCE_PARAM);
-    assert null != sources;
-
-    if (1 == sources.size()) {
-      if (sources.get(0) instanceof NamedList) {
-        // nested set of selector options
-        NamedList selectorConfig = (NamedList) args.remove(SOURCE_PARAM);
-
-        srcInclusions = parseSelectorParams(selectorConfig);
-
-        List<Object> excList = selectorConfig.getAll("exclude");
-
-        for (Object excObj : excList) {
-          if (null == excObj) {
-            throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
-                "' child 'exclude' can not be null");
-          }
-          if (!(excObj instanceof NamedList)) {
-            throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
-                "' child 'exclude' must be <lst/>");
-          }
-          NamedList exc = (NamedList) excObj;
-          srcExclusions.add(parseSelectorParams(exc));
-          if (0 < exc.size()) {
-            // report the leftover entry of the exclude list itself, not of its parent selector
-            throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
-                "' has unexpected 'exclude' sub-param(s): '"
-                + exc.getName(0) + "'");
-          }
-          // call once per instance
-          selectorConfig.remove("exclude");
-        }
-
-        if (0 < selectorConfig.size()) {
-          throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
-              "' contains unexpected child param(s): '" +
-              selectorConfig.getName(0) + "'");
-        }
-        // consume from the named list so it doesn't interfere with subsequent processing
-        sources.remove(0);
-      }
-    }
-    if (1 <= sources.size()) {
-      // source better be one or more strings
-      srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs("source"));
-    }
-    if (srcInclusions == null) {
-      throw new SolrException(SERVER_ERROR,
-          "Init params do not specify any field from which to extract entities, please supply either "
-          + SOURCE_PARAM + " and " + DEST_PARAM + " or " + PATTERN_PARAM + " and " + REPLACEMENT_PARAM + ". See javadocs " +
-          "for OpenNLPExtractNamedEntitiesUpdateProcessor for further details.");
-    }
-
-    if (d instanceof NamedList) {
-      NamedList destList = (NamedList) d;
-
-      Object patt = destList.remove(PATTERN_PARAM);
-      Object replacement = destList.remove(REPLACEMENT_PARAM);
-
-      if (null == patt || null == replacement) {
-        throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
-            PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
-            "' are both mandatory and can not be null");
-      }
-      if (! (patt instanceof String && replacement instanceof String)) {
-        throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
-            PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
-            "' must both be strings (i.e. <str>)");
-      }
-      if (0 != destList.size()) {
-        throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' has unexpected children: '"
-            + destList.getName(0) + "'");
-      }
-
-      try {
-        this.pattern = Pattern.compile(patt.toString());
-      } catch (PatternSyntaxException pe) {
-        throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' child '" + PATTERN_PARAM +
-            "' is not a valid regex pattern: " + patt, pe);
-      }
-      dest = replacement.toString();
-
-    } else if (d instanceof String) {
-      // literal destination field name; pattern stays null
-      dest = d.toString();
-    } else {
-      throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' must either be a string " +
-          "(i.e. <str>) or a list (i.e. <lst>) containing '" +
-          PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM + "'");
-    }
-
-  }
-
-  /**
-   * Builds the source field selector from the parsed inclusion params, wraps it once per parsed
-   * exclusion, and then requests the configured NER model from OpenNLPOpsFactory
-   * (presumably to load it eagerly -- the returned value is not used here).
-   *
-   * @param core the core providing the resource loader
-   * @throws IllegalArgumentException wrapping any IOException raised while requesting the model
-   */
-  @Override
-  public void inform(final SolrCore core) {
-
-    // start from the inclusions ...
-    srcSelector = FieldMutatingUpdateProcessor.createFieldNameSelector(
-        core.getResourceLoader(), core, srcInclusions, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
-
-    // ... then layer each exclusion on top of what we have so far
-    for (SelectorParams exclusion : srcExclusions) {
-      srcSelector = FieldMutatingUpdateProcessor.wrap(
-          srcSelector,
-          FieldMutatingUpdateProcessor.createFieldNameSelector(
-              core.getResourceLoader(), core, exclusion, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS));
-    }
-
-    try {
-      OpenNLPOpsFactory.getNERTaggerModel(modelFile, core.getResourceLoader());
-    } catch (IOException ioe) {
-      throw new IllegalArgumentException(ioe);
-    }
-  }
-
-  /**
-   * Creates a processor that, for every added document, runs the configured NER tagger over
-   * the values of each selected source field and adds the extracted entity names to the
-   * resolved destination field(s).
-   *
-   * @param req  the request; its schema supplies the index analyzer of the configured field type
-   * @param rsp  the response (not used by this processor)
-   * @param next the next processor in the chain
-   * @return the new processor
-   * @throws SolrException if the selector was never initialized or the configured analyzer
-   *         field type is not in the schema
-   * @throws IllegalArgumentException wrapping any IOException raised while obtaining the tagger
-   */
-  @Override
-  public final UpdateRequestProcessor getInstance
-      (SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
-    final FieldNameSelector srcSelector = getSourceSelector();
-    return new UpdateRequestProcessor(next) {
-      private final NLPNERTaggerOp nerTaggerOp;
-      private Analyzer analyzer = null;
-      {
-        try {
-          nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
-          FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
-          if (fieldType == null) {
-            throw new SolrException
-                (SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
-          }
-          analyzer = fieldType.getIndexAnalyzer();
-        } catch (IOException e) {
-          throw new IllegalArgumentException(e);
-        }
-      }
-
-      @Override
-      public void processAdd(AddUpdateCommand cmd) throws IOException {
-
-        final SolrInputDocument doc = cmd.getSolrInputDocument();
-
-        // Destination may be regex replace string, or "{EntityType}" replaced by
-        // each entity's type, both of which can cause multiple output fields.
-        Map<String,SolrInputField> destMap = new HashMap<>();
-
-        // preserve initial values
-        for (final String fname : doc.getFieldNames()) {
-          if ( ! srcSelector.shouldMutate(fname)) continue;
-
-          Collection<Object> srcFieldValues = doc.getFieldValues(fname);
-          if (srcFieldValues == null || srcFieldValues.isEmpty()) continue;
-
-          String resolvedDest = dest;
-
-          if (pattern != null) {
-            Matcher matcher = pattern.matcher(fname);
-            if (matcher.find()) {
-              resolvedDest = matcher.replaceAll(dest);
-            } else {
-              log.debug("srcSelector.shouldMutate(\"{}\") returned true, " +
-                  "but replacement pattern did not match, field skipped.", fname);
-              continue;
-            }
-          }
-
-          for (Object val : srcFieldValues) {
-            for (Pair<String,String> entity : extractTypedNamedEntities(val)) {
-              SolrInputField destField = null;
-              String entityName = entity.first();
-              String entityType = entity.second();
-              final String resolved = resolvedDest.replace(ENTITY_TYPE, entityType);
-              if (doc.containsKey(resolved)) {
-                destField = doc.getField(resolved);
-              } else {
-                SolrInputField targetField = destMap.get(resolved);
-                if (targetField == null) {
-                  destField = new SolrInputField(resolved);
-                } else {
-                  destField = targetField;
-                }
-              }
-              destField.addValue(entityName);
-
-              // put it in map to avoid concurrent modification...
-              destMap.put(resolved, destField);
-            }
-          }
-        }
-
-        // now that iteration over the doc's field names is done, install the collected fields
-        for (Map.Entry<String,SolrInputField> entry : destMap.entrySet()) {
-          doc.put(entry.getKey(), entry.getValue());
-        }
-        super.processAdd(cmd);
-      }
-
-      /** Using configured NER model, extracts (name, type) pairs from the given source field value */
-      private List<Pair<String,String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
-        List<Pair<String,String>> entitiesWithType = new ArrayList<>();
-        List<String> terms = new ArrayList<>();
-        List<Integer> startOffsets = new ArrayList<>();
-        List<Integer> endOffsets = new ArrayList<>();
-        String fullText = srcFieldValue.toString();
-        // try-with-resources: the stream must be closed even when analysis or tagging throws,
-        // otherwise it is leaked and never released.
-        try (TokenStream tokenStream = analyzer.tokenStream("", fullText)) {
-          CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
-          OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
-          FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
-          tokenStream.reset();
-          synchronized (nerTaggerOp) {
-            try {
-              while (tokenStream.incrementToken()) {
-                terms.add(termAtt.toString());
-                startOffsets.add(offsetAtt.startOffset());
-                endOffsets.add(offsetAtt.endOffset());
-                boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
-                if (endOfSentence) {    // extract named entities one sentence at a time
-                  extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
-                }
-              }
-              tokenStream.end();
-              if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
-                extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
-              }
-            } finally {
-              // Forget all adaptive data collected during this call, even if it failed part-way
-              nerTaggerOp.reset();
-            }
-          }
-        }
-        return entitiesWithType;
-      }
-
-      /**
-       * Tags the buffered sentence terms, appends one (text, type) pair per returned span to
-       * entitiesWithType, and clears the three per-sentence buffers.  The entity text is cut
-       * from fullText using the start offset of the span's first term and the end offset of
-       * its last term.
-       */
-      private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
-                                               List<Integer> endOffsets, List<Pair<String,String>> entitiesWithType) {
-        for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
-          String text = fullText.substring(startOffsets.get(span.getStart()), endOffsets.get(span.getEnd() - 1));
-          entitiesWithType.add(new Pair<>(text, span.getType()));
-        }
-        terms.clear();
-        startOffsets.clear();
-        endOffsets.clear();
-      }
-    };
-  }
-
-  /**
-   * Shorthand ("macro") that simply delegates to
-   * {@link FieldMutatingUpdateProcessorFactory#parseSelectorParams(NamedList)}.
-   *
-   * @param args the selector configuration to parse
-   * @return whatever the delegate returns for <code>args</code>
-   */
-  private static SelectorParams parseSelectorParams(NamedList args) {
-    return FieldMutatingUpdateProcessorFactory.parseSelectorParams(args);
-  }
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/package.html b/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/package.html
deleted file mode 100644
index 1388c29..0000000
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/package.html
+++ /dev/null
@@ -1,24 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!-- not a package-info.java, because we already defined this package in core/ -->
-<html>
-  <body>
-    Update request processor invoking OpenNLP Named Entity Recognition over configured
-    source field(s), populating configured target field(s) with the results.
-  </body>
-</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/overview.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/overview.html b/solr/contrib/analysis-extras/src/java/overview.html
deleted file mode 100644
index f3d70ca..0000000
--- a/solr/contrib/analysis-extras/src/java/overview.html
+++ /dev/null
@@ -1,21 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<html>
-<body>
-Apache Solr Search Server: Analysis Extras contrib
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/ICUCollationField.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/ICUCollationField.java b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/ICUCollationField.java
new file mode 100644
index 0000000..f723a25
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/ICUCollationField.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.response.TextResponseWriter;
+import org.apache.solr.search.QParser;
+import org.apache.solr.uninverting.UninvertingReader.Type;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Field for collated sort keys. 
+ * These can be used for locale-sensitive sort and range queries.
+ * <p>
+ * This field can be created in two ways: 
+ * <ul>
+ *  <li>Based upon a system collator associated with a Locale.
+ *  <li>Based upon a tailored ruleset.
+ * </ul>
+ * <p>
+ * Using a System collator:
+ * <ul>
+ *  <li>locale: RFC 3066 locale ID (mandatory)
+ *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
+ *  <li>decomposition: 'no', or 'canonical' (optional)
+ * </ul>
+ * <p>
+ * Using a Tailored ruleset:
+ * <ul>
+ *  <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
+ *  <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
+ *  <li>decomposition: 'no' or 'canonical' (optional)
+ * </ul>
+ * <p>
+ * Expert options:
+ * <ul>
+ *  <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
+ *  <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
+ *  <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
+ *  <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
+ *  <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
+ * </ul>
+ * 
+ * @see Collator
+ * @see ULocale
+ * @see RuleBasedCollator
+ */
+public class ICUCollationField extends FieldType {
+  private Analyzer analyzer;
+
+  @Override
+  protected void init(IndexSchema schema, Map<String,String> args) {
+    properties |= TOKENIZED; // this ensures our analyzer gets hit
+    setup(schema.getResourceLoader(), args);
+    super.init(schema, args);
+  }
+  
+  /**
+   * Setup the field according to the provided parameters
+   */
+  private void setup(ResourceLoader loader, Map<String,String> args) {
+    String custom = args.remove("custom");
+    String localeID = args.remove("locale");
+    String strength = args.remove("strength");
+    String decomposition = args.remove("decomposition");
+    
+    String alternate = args.remove("alternate");
+    String caseLevel = args.remove("caseLevel");
+    String caseFirst = args.remove("caseFirst");
+    String numeric = args.remove("numeric");
+    String variableTop = args.remove("variableTop");
+
+    if (custom == null && localeID == null)
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
+    
+    if (custom != null && localeID != null)
+      throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
+          + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
+          + "Then save the entire customized ruleset to a file, and use with the custom parameter");
+    
+    final Collator collator;
+    
+    if (localeID != null) { 
+      // create from a system collator, based on Locale.
+      collator = createFromLocale(localeID);
+    } else { 
+      // create from a custom ruleset
+      collator = createFromRules(custom, loader);
+    }
+    
+    // set the strength flag, otherwise it will be the default.
+    if (strength != null) {
+      if (strength.equalsIgnoreCase("primary"))
+        collator.setStrength(Collator.PRIMARY);
+      else if (strength.equalsIgnoreCase("secondary"))
+        collator.setStrength(Collator.SECONDARY);
+      else if (strength.equalsIgnoreCase("tertiary"))
+        collator.setStrength(Collator.TERTIARY);
+      else if (strength.equalsIgnoreCase("quaternary"))
+        collator.setStrength(Collator.QUATERNARY);
+      else if (strength.equalsIgnoreCase("identical"))
+        collator.setStrength(Collator.IDENTICAL);
+      else
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
+    }
+    
+    // set the decomposition flag, otherwise it will be the default.
+    if (decomposition != null) {
+      if (decomposition.equalsIgnoreCase("no"))
+        collator.setDecomposition(Collator.NO_DECOMPOSITION);
+      else if (decomposition.equalsIgnoreCase("canonical"))
+        collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
+      else
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
+    }
+    
+    // expert options: concrete subclasses are always a RuleBasedCollator
+    RuleBasedCollator rbc = (RuleBasedCollator) collator;
+    if (alternate != null) {
+      if (alternate.equalsIgnoreCase("shifted")) {
+        rbc.setAlternateHandlingShifted(true);
+      } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+        rbc.setAlternateHandlingShifted(false);
+      } else {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
+      }
+    }
+    if (caseLevel != null) {
+      rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
+    }
+    if (caseFirst != null) {
+      if (caseFirst.equalsIgnoreCase("lower")) {
+        rbc.setLowerCaseFirst(true);
+      } else if (caseFirst.equalsIgnoreCase("upper")) {
+        rbc.setUpperCaseFirst(true);
+      } else {
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
+      }
+    }
+    if (numeric != null) {
+      rbc.setNumericCollation(Boolean.parseBoolean(numeric));
+    }
+    if (variableTop != null) {
+      rbc.setVariableTop(variableTop);
+    }
+
+    analyzer = new ICUCollationKeyAnalyzer(collator);
+  }
+  
+  /**
+   * Create a locale from localeID.
+   * Then return the appropriate collator for the locale.
+   */
+  private Collator createFromLocale(String localeID) {
+    return Collator.getInstance(new ULocale(localeID));
+  }
+  
+  /**
+   * Read custom rules from a file, and create a RuleBasedCollator
+   * The file cannot support comments, as # might be in the rules!
+   */
+  static Collator createFromRules(String fileName, ResourceLoader loader) {
+    InputStream input = null;
+    try {
+     input = loader.openResource(fileName);
+     String rules = IOUtils.toString(input, "UTF-8");
+     return new RuleBasedCollator(rules);
+    } catch (Exception e) {
+      // io error or invalid rules
+      throw new RuntimeException(e);
+    } finally {
+      IOUtils.closeQuietly(input);
+    }
+  }
+
+  @Override
+  public void write(TextResponseWriter writer, String name, IndexableField f) throws IOException {
+    writer.writeStr(name, f.stringValue(), true);
+  }
+
+  @Override
+  public SortField getSortField(SchemaField field, boolean top) {
+    return getStringSort(field, top);
+  }
+  
+  @Override
+  public Type getUninversionType(SchemaField sf) {
+    if (sf.multiValued()) {
+      return Type.SORTED_SET_BINARY; 
+    } else {
+      return Type.SORTED;
+    }
+  }
+
+  @Override
+  public Analyzer getIndexAnalyzer() {
+    return analyzer;
+  }
+
+  @Override
+  public Analyzer getQueryAnalyzer() {
+    return analyzer;
+  }
+
+  /**
+   * analyze the text with the analyzer, instead of the collator.
+   * because icu collators are not thread safe, this keeps things 
+   * simple (we already have a threadlocal clone in the reused TS)
+   */
+  private BytesRef getCollationKey(String field, String text) {
+    try (TokenStream source = analyzer.tokenStream(field, text)) {
+      source.reset();
+      
+      TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
+      
+
+      // we control the analyzer here: most errors are impossible
+      if (!source.incrementToken())
+        throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
+      BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
+      assert !source.incrementToken();
+      
+      source.end();
+      return bytes;
+    } catch (IOException e) {
+      throw new RuntimeException("Unable to analyze text: " + text, e);
+    }
+  }
+  
+  @Override
+  public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
+    String f = field.getName();
+    BytesRef low = part1 == null ? null : getCollationKey(f, part1);
+    BytesRef high = part2 == null ? null : getCollationKey(f, part2);
+    if (!field.indexed() && field.hasDocValues()) {
+      return SortedSetDocValuesField.newSlowRangeQuery(
+          field.getName(), low, high, minInclusive, maxInclusive);
+    } else {
+      return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
+    }
+  }
+
+  @Override
+  protected void checkSupportsDocValues() { // we support DocValues
+  }
+
+  @Override
+  public List<IndexableField> createFields(SchemaField field, Object value) {
+    if (field.hasDocValues()) {
+      List<IndexableField> fields = new ArrayList<>();
+      fields.add(createField(field, value));
+      final BytesRef bytes = getCollationKey(field.getName(), value.toString());
+      if (field.multiValued()) {
+        fields.add(new SortedSetDocValuesField(field.getName(), bytes));
+      } else {
+        fields.add(new SortedDocValuesField(field.getName(), bytes));
+      }
+      return fields;
+    } else {
+      return Collections.singletonList(createField(field, value));
+    }
+  }
+
+  @Override
+  public Object marshalSortValue(Object value) {
+    return marshalBase64SortValue(value);
+  }
+
+  @Override
+  public Object unmarshalSortValue(Object value) {
+    return unmarshalBase64SortValue(value);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/package.html b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/package.html
new file mode 100644
index 0000000..27d68db
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/package.html
@@ -0,0 +1,23 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- not a package-info.java, because we already defined this package in core/ -->
+<html>
+<body>
+<code>FieldType</code> plugins that have additional dependencies.
+</body>
+</html>