You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2018/11/02 10:30:05 UTC
[7/7] lucene-solr:jira/gradle: Adding solr:analysis-extras module
Adding solr:analysis-extras module
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/6c070b4a
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/6c070b4a
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/6c070b4a
Branch: refs/heads/jira/gradle
Commit: 6c070b4a553f8c2da8f2481b1977f1ec239ae610
Parents: 5e447a4
Author: Cao Manh Dat <da...@apache.org>
Authored: Fri Nov 2 10:29:44 2018 +0000
Committer: Cao Manh Dat <da...@apache.org>
Committed: Fri Nov 2 10:29:44 2018 +0000
----------------------------------------------------------------------
lucene/analysis/common/build.gradle | 13 +
settings.gradle | 3 +-
solr/contrib/analysis-extras/build.gradle | 17 +
.../apache/solr/schema/ICUCollationField.java | 311 ----------
.../java/org/apache/solr/schema/package.html | 23 -
...ractNamedEntitiesUpdateProcessorFactory.java | 577 -------------------
.../apache/solr/update/processor/package.html | 24 -
.../analysis-extras/src/java/overview.html | 21 -
.../apache/solr/schema/ICUCollationField.java | 311 ++++++++++
.../java/org/apache/solr/schema/package.html | 23 +
...ractNamedEntitiesUpdateProcessorFactory.java | 577 +++++++++++++++++++
.../apache/solr/update/processor/package.html | 24 +
.../analysis-extras/src/main/java/overview.html | 21 +
.../solr/collection1/conf/en-test-ner.bin | Bin 2049 -> 0 bytes
.../solr/collection1/conf/en-test-sent.bin | Bin 1051 -> 0 bytes
.../solr/collection1/conf/en-test-tokenizer.bin | Bin 15100 -> 0 bytes
.../collection1/conf/schema-folding-extra.xml | 52 --
.../collection1/conf/schema-icucollate-dv.xml | 57 --
.../solr/collection1/conf/schema-icucollate.xml | 57 --
.../conf/schema-icucollateoptions.xml | 68 ---
.../collection1/conf/schema-opennlp-extract.xml | 49 --
.../collection1/conf/solrconfig-icucollate.xml | 27 -
.../conf/solrconfig-opennlp-extract.xml | 206 -------
.../solrconfig.snippet.randomindexconfig.xml | 48 --
.../TestFoldingMultitermExtrasQuery.java | 87 +++
.../solr/schema/TestICUCollationField.java | 192 ++++++
.../schema/TestICUCollationFieldDocValues.java | 180 ++++++
.../schema/TestICUCollationFieldOptions.java | 119 ++++
...ractNamedEntitiesUpdateProcessorFactory.java | 195 +++++++
.../TestFoldingMultitermExtrasQuery.java | 87 ---
.../solr/schema/TestICUCollationField.java | 192 ------
.../schema/TestICUCollationFieldDocValues.java | 180 ------
.../schema/TestICUCollationFieldOptions.java | 119 ----
...ractNamedEntitiesUpdateProcessorFactory.java | 195 -------
.../solr/collection1/conf/en-test-ner.bin | Bin 0 -> 2049 bytes
.../solr/collection1/conf/en-test-sent.bin | Bin 0 -> 1051 bytes
.../solr/collection1/conf/en-test-tokenizer.bin | Bin 0 -> 15100 bytes
.../collection1/conf/schema-folding-extra.xml | 52 ++
.../collection1/conf/schema-icucollate-dv.xml | 57 ++
.../solr/collection1/conf/schema-icucollate.xml | 57 ++
.../conf/schema-icucollateoptions.xml | 68 +++
.../collection1/conf/schema-opennlp-extract.xml | 49 ++
.../collection1/conf/solrconfig-icucollate.xml | 27 +
.../conf/solrconfig-opennlp-extract.xml | 206 +++++++
.../solrconfig.snippet.randomindexconfig.xml | 48 ++
solr/core/build.gradle | 1 +
46 files changed, 2326 insertions(+), 2294 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/lucene/analysis/common/build.gradle
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/build.gradle b/lucene/analysis/common/build.gradle
index 73321f7..841f27c 100644
--- a/lucene/analysis/common/build.gradle
+++ b/lucene/analysis/common/build.gradle
@@ -1,5 +1,18 @@
apply plugin: 'java'
+task jarTest (type: Jar) {
+ from sourceSets.test.output
+ classifier = 'test'
+}
+
+configurations {
+ testOutput
+}
+
+artifacts {
+ testOutput jarTest
+}
+
dependencies {
compile project(':lucene:core')
testCompile project(':lucene:codecs')
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/settings.gradle
----------------------------------------------------------------------
diff --git a/settings.gradle b/settings.gradle
index 56554a7..5884dcb 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -45,4 +45,5 @@ include 'solr:core'
include 'solr:server'
include 'solr:solrj'
include 'solr:test-framework'
-include 'solr:example:example-DIH'
\ No newline at end of file
+include 'solr:example:example-DIH'
+include 'solr:contrib:analysis-extras'
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/build.gradle
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/build.gradle b/solr/contrib/analysis-extras/build.gradle
new file mode 100644
index 0000000..6eba4ad
--- /dev/null
+++ b/solr/contrib/analysis-extras/build.gradle
@@ -0,0 +1,17 @@
+apply plugin: 'java'
+
+dependencies {
+ compile library.icu4j
+ compile library.opennlp_tools
+ compile library.morfologik_fsa
+ compile library.morfologik_polish
+ compile library.morfologik_stemming
+
+ compile project(':solr:solrj')
+ compile project(':solr:core')
+ compile project(':lucene:analysis:icu')
+ compile project(':lucene:analysis:opennlp')
+
+ testCompile project(':solr:test-framework')
+ testCompile project(path: ':lucene:analysis:common', configuration: 'testOutput')
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
deleted file mode 100644
index f723a25..0000000
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.solr.schema;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
-import org.apache.lucene.document.SortedDocValuesField;
-import org.apache.lucene.document.SortedSetDocValuesField;
-import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermRangeQuery;
-import org.apache.lucene.util.BytesRef;
-import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.response.TextResponseWriter;
-import org.apache.solr.search.QParser;
-import org.apache.solr.uninverting.UninvertingReader.Type;
-
-import com.ibm.icu.text.Collator;
-import com.ibm.icu.text.RuleBasedCollator;
-import com.ibm.icu.util.ULocale;
-
-/**
- * Field for collated sort keys.
- * These can be used for locale-sensitive sort and range queries.
- * <p>
- * This field can be created in two ways:
- * <ul>
- * <li>Based upon a system collator associated with a Locale.
- * <li>Based upon a tailored ruleset.
- * </ul>
- * <p>
- * Using a System collator:
- * <ul>
- * <li>locale: RFC 3066 locale ID (mandatory)
- * <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
- * <li>decomposition: 'no', or 'canonical' (optional)
- * </ul>
- * <p>
- * Using a Tailored ruleset:
- * <ul>
- * <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
- * <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
- * <li>decomposition: 'no' or 'canonical' (optional)
- * </ul>
- * <p>
- * Expert options:
- * <ul>
- * <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
- * <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
- * <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
- * <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
- * <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
- * </ul>
- *
- * @see Collator
- * @see ULocale
- * @see RuleBasedCollator
- */
-public class ICUCollationField extends FieldType {
- private Analyzer analyzer;
-
- @Override
- protected void init(IndexSchema schema, Map<String,String> args) {
- properties |= TOKENIZED; // this ensures our analyzer gets hit
- setup(schema.getResourceLoader(), args);
- super.init(schema, args);
- }
-
- /**
- * Setup the field according to the provided parameters
- */
- private void setup(ResourceLoader loader, Map<String,String> args) {
- String custom = args.remove("custom");
- String localeID = args.remove("locale");
- String strength = args.remove("strength");
- String decomposition = args.remove("decomposition");
-
- String alternate = args.remove("alternate");
- String caseLevel = args.remove("caseLevel");
- String caseFirst = args.remove("caseFirst");
- String numeric = args.remove("numeric");
- String variableTop = args.remove("variableTop");
-
- if (custom == null && localeID == null)
- throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
-
- if (custom != null && localeID != null)
- throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
- + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
- + "Then save the entire customized ruleset to a file, and use with the custom parameter");
-
- final Collator collator;
-
- if (localeID != null) {
- // create from a system collator, based on Locale.
- collator = createFromLocale(localeID);
- } else {
- // create from a custom ruleset
- collator = createFromRules(custom, loader);
- }
-
- // set the strength flag, otherwise it will be the default.
- if (strength != null) {
- if (strength.equalsIgnoreCase("primary"))
- collator.setStrength(Collator.PRIMARY);
- else if (strength.equalsIgnoreCase("secondary"))
- collator.setStrength(Collator.SECONDARY);
- else if (strength.equalsIgnoreCase("tertiary"))
- collator.setStrength(Collator.TERTIARY);
- else if (strength.equalsIgnoreCase("quaternary"))
- collator.setStrength(Collator.QUATERNARY);
- else if (strength.equalsIgnoreCase("identical"))
- collator.setStrength(Collator.IDENTICAL);
- else
- throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
- }
-
- // set the decomposition flag, otherwise it will be the default.
- if (decomposition != null) {
- if (decomposition.equalsIgnoreCase("no"))
- collator.setDecomposition(Collator.NO_DECOMPOSITION);
- else if (decomposition.equalsIgnoreCase("canonical"))
- collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
- else
- throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
- }
-
- // expert options: concrete subclasses are always a RuleBasedCollator
- RuleBasedCollator rbc = (RuleBasedCollator) collator;
- if (alternate != null) {
- if (alternate.equalsIgnoreCase("shifted")) {
- rbc.setAlternateHandlingShifted(true);
- } else if (alternate.equalsIgnoreCase("non-ignorable")) {
- rbc.setAlternateHandlingShifted(false);
- } else {
- throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
- }
- }
- if (caseLevel != null) {
- rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
- }
- if (caseFirst != null) {
- if (caseFirst.equalsIgnoreCase("lower")) {
- rbc.setLowerCaseFirst(true);
- } else if (caseFirst.equalsIgnoreCase("upper")) {
- rbc.setUpperCaseFirst(true);
- } else {
- throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
- }
- }
- if (numeric != null) {
- rbc.setNumericCollation(Boolean.parseBoolean(numeric));
- }
- if (variableTop != null) {
- rbc.setVariableTop(variableTop);
- }
-
- analyzer = new ICUCollationKeyAnalyzer(collator);
- }
-
- /**
- * Create a locale from localeID.
- * Then return the appropriate collator for the locale.
- */
- private Collator createFromLocale(String localeID) {
- return Collator.getInstance(new ULocale(localeID));
- }
-
- /**
- * Read custom rules from a file, and create a RuleBasedCollator
- * The file cannot support comments, as # might be in the rules!
- */
- static Collator createFromRules(String fileName, ResourceLoader loader) {
- InputStream input = null;
- try {
- input = loader.openResource(fileName);
- String rules = IOUtils.toString(input, "UTF-8");
- return new RuleBasedCollator(rules);
- } catch (Exception e) {
- // io error or invalid rules
- throw new RuntimeException(e);
- } finally {
- IOUtils.closeQuietly(input);
- }
- }
-
- @Override
- public void write(TextResponseWriter writer, String name, IndexableField f) throws IOException {
- writer.writeStr(name, f.stringValue(), true);
- }
-
- @Override
- public SortField getSortField(SchemaField field, boolean top) {
- return getStringSort(field, top);
- }
-
- @Override
- public Type getUninversionType(SchemaField sf) {
- if (sf.multiValued()) {
- return Type.SORTED_SET_BINARY;
- } else {
- return Type.SORTED;
- }
- }
-
- @Override
- public Analyzer getIndexAnalyzer() {
- return analyzer;
- }
-
- @Override
- public Analyzer getQueryAnalyzer() {
- return analyzer;
- }
-
- /**
- * analyze the text with the analyzer, instead of the collator.
- * because icu collators are not thread safe, this keeps things
- * simple (we already have a threadlocal clone in the reused TS)
- */
- private BytesRef getCollationKey(String field, String text) {
- try (TokenStream source = analyzer.tokenStream(field, text)) {
- source.reset();
-
- TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
-
-
- // we control the analyzer here: most errors are impossible
- if (!source.incrementToken())
- throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
- BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
- assert !source.incrementToken();
-
- source.end();
- return bytes;
- } catch (IOException e) {
- throw new RuntimeException("Unable to analyze text: " + text, e);
- }
- }
-
- @Override
- public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
- String f = field.getName();
- BytesRef low = part1 == null ? null : getCollationKey(f, part1);
- BytesRef high = part2 == null ? null : getCollationKey(f, part2);
- if (!field.indexed() && field.hasDocValues()) {
- return SortedSetDocValuesField.newSlowRangeQuery(
- field.getName(), low, high, minInclusive, maxInclusive);
- } else {
- return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
- }
- }
-
- @Override
- protected void checkSupportsDocValues() { // we support DocValues
- }
-
- @Override
- public List<IndexableField> createFields(SchemaField field, Object value) {
- if (field.hasDocValues()) {
- List<IndexableField> fields = new ArrayList<>();
- fields.add(createField(field, value));
- final BytesRef bytes = getCollationKey(field.getName(), value.toString());
- if (field.multiValued()) {
- fields.add(new SortedSetDocValuesField(field.getName(), bytes));
- } else {
- fields.add(new SortedDocValuesField(field.getName(), bytes));
- }
- return fields;
- } else {
- return Collections.singletonList(createField(field, value));
- }
- }
-
- @Override
- public Object marshalSortValue(Object value) {
- return marshalBase64SortValue(value);
- }
-
- @Override
- public Object unmarshalSortValue(Object value) {
- return unmarshalBase64SortValue(value);
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/package.html b/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/package.html
deleted file mode 100644
index 27d68db..0000000
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/schema/package.html
+++ /dev/null
@@ -1,23 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!-- not a package-info.java, because we already defined this package in core/ -->
-<html>
-<body>
-<code>FieldType</code> plugins that have additional dependencies.
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
deleted file mode 100644
index d69c367..0000000
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
+++ /dev/null
@@ -1,577 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.update.processor;
-
-import java.io.IOException;
-import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.regex.PatternSyntaxException;
-
-import opennlp.tools.util.Span;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.opennlp.OpenNLPTokenizer;
-import org.apache.lucene.analysis.opennlp.tools.NLPNERTaggerOp;
-import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.SolrInputField;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.common.util.Pair;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.response.SolrQueryResponse;
-import org.apache.solr.schema.FieldType;
-import org.apache.solr.update.AddUpdateCommand;
-import org.apache.solr.update.processor.FieldMutatingUpdateProcessor.FieldNameSelector;
-import org.apache.solr.update.processor.FieldMutatingUpdateProcessorFactory.SelectorParams;
-import org.apache.solr.util.plugin.SolrCoreAware;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
-
-/**
- * Extracts named entities using an OpenNLP NER <code>modelFile</code> from the values found in
- * any matching <code>source</code> field into a configured <code>dest</code> field, after
- * first tokenizing the source text using the index analyzer on the configured
- * <code>analyzerFieldType</code>, which must include <code>solr.OpenNLPTokenizerFactory</code>
- * as the tokenizer. E.g.:
- *
- * <pre class="prettyprint">
- * &lt;fieldType name="opennlp-en-tokenization" class="solr.TextField"&gt;
- * &lt;analyzer&gt;
- * &lt;tokenizer class="solr.OpenNLPTokenizerFactory"
- * sentenceModel="en-sent.bin"
- * tokenizerModel="en-tokenizer.bin"/&gt;
- * &lt;/analyzer&gt;
- * &lt;/fieldType&gt;
- * </pre>
- *
- * <p>See the <a href="http://opennlp.apache.org/models.html">OpenNLP website</a>
- * for information on downloading pre-trained models.</p>
- *
- * Note that in order to use model files larger than 1MB on SolrCloud,
- * <a href="https://lucene.apache.org/solr/guide/setting-up-an-external-zookeeper-ensemble#increasing-zookeeper-s-1mb-file-size-limit"
- * >ZooKeeper server and client configuration is required</a>.
- *
- * <p>
- * The <code>source</code> field(s) can be configured as either:
- * </p>
- * <ul>
- * <li>One or more <code>&lt;str&gt;</code></li>
- * <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code></li>
- * <li>A <code>&lt;lst&gt;</code> containing
- * {@link FieldMutatingUpdateProcessor FieldMutatingUpdateProcessorFactory style selector arguments}</li>
- * </ul>
- *
- * <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code>
- * containing the literal name of a destination field, or it may be a <code>&lt;lst&gt;</code> specifying a
- * regex <code>pattern</code> and a <code>replacement</code> string. If the pattern + replacement option
- * is used the pattern will be matched against all fields matched by the source selector, and the replacement
- * string (including any capture groups specified from the pattern) will be evaluated using
- * {@link Matcher#replaceAll(String)} to generate the literal name of the destination field. Additionally,
- * an occurrence of the string "{EntityType}" in the <code>dest</code> field specification, or in the
- * <code>replacement</code> string, will be replaced with the entity type(s) returned for each entity by
- * the OpenNLP NER model; as a result, if the model extracts more than one entity type, then more than one
- * <code>dest</code> field will be populated.
- * </p>
- *
- * <p>If the resolved <code>dest</code> field already exists in the document, then the
- * named entities extracted from the <code>source</code> fields will be added to it.
- * </p>
- * <p>
- * In the example below:
- * </p>
- * <ul>
- * <li>Named entities will be extracted from the <code>text</code> field and added
- * to the <code>people_s</code> field</li>
- * <li>Named entities will be extracted from both the <code>title</code> and
- * <code>subtitle</code> fields and added into the <code>titular_people</code> field</li>
- * <li>Named entities will be extracted from any field with a name ending in <code>_txt</code>
- * -- except for <code>notes_txt</code> -- and added into the <code>people_s</code> field</li>
- * <li>Named entities will be extracted from any field with a name beginning with "desc" and
- * ending in "s" (e.g. "descs" and "descriptions") and added to a field prefixed with "key_",
- * not ending in "s", and suffixed with "_people". (e.g. "key_desc_people" or
- * "key_description_people")</li>
- * <li>Named entities will be extracted from the <code>summary</code> field and added
- * to the <code>summary_person_s</code> field, assuming that the modelFile only extracts
- * entities of type "person".</li>
- * </ul>
- *
- * <pre class="prettyprint">
- * &lt;updateRequestProcessorChain name="multiple-extract"&gt;
- * &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- * &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- * &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- * &lt;str name="source"&gt;text&lt;/str&gt;
- * &lt;str name="dest"&gt;people_s&lt;/str&gt;
- * &lt;/processor&gt;
- * &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- * &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- * &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- * &lt;arr name="source"&gt;
- * &lt;str&gt;title&lt;/str&gt;
- * &lt;str&gt;subtitle&lt;/str&gt;
- * &lt;/arr&gt;
- * &lt;str name="dest"&gt;titular_people&lt;/str&gt;
- * &lt;/processor&gt;
- * &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- * &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- * &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- * &lt;lst name="source"&gt;
- * &lt;str name="fieldRegex"&gt;.*_txt$&lt;/str&gt;
- * &lt;lst name="exclude"&gt;
- * &lt;str name="fieldName"&gt;notes_txt&lt;/str&gt;
- * &lt;/lst&gt;
- * &lt;/lst&gt;
- * &lt;str name="dest"&gt;people_s&lt;/str&gt;
- * &lt;/processor&gt;
- * &lt;processor class="solr.processor.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- * &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- * &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- * &lt;lst name="source"&gt;
- * &lt;str name="fieldRegex"&gt;^desc(.*)s$&lt;/str&gt;
- * &lt;/lst&gt;
- * &lt;lst name="dest"&gt;
- * &lt;str name="pattern"&gt;^desc(.*)s$&lt;/str&gt;
- * &lt;str name="replacement"&gt;key_desc$1_people&lt;/str&gt;
- * &lt;/lst&gt;
- * &lt;/processor&gt;
- * &lt;processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory"&gt;
- * &lt;str name="modelFile"&gt;en-test-ner-person.bin&lt;/str&gt;
- * &lt;str name="analyzerFieldType"&gt;opennlp-en-tokenization&lt;/str&gt;
- * &lt;str name="source"&gt;summary&lt;/str&gt;
- * &lt;str name="dest"&gt;summary_{EntityType}_s&lt;/str&gt;
- * &lt;/processor&gt;
- * &lt;processor class="solr.LogUpdateProcessorFactory" /&gt;
- * &lt;processor class="solr.RunUpdateProcessorFactory" /&gt;
- * &lt;/updateRequestProcessorChain&gt;
- * </pre>
- *
- * @since 7.3.0
- */
-public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory
- extends UpdateRequestProcessorFactory implements SolrCoreAware {
-
- private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
- public static final String SOURCE_PARAM = "source";
- public static final String DEST_PARAM = "dest";
- public static final String PATTERN_PARAM = "pattern";
- public static final String REPLACEMENT_PARAM = "replacement";
- public static final String MODEL_PARAM = "modelFile";
- public static final String ANALYZER_FIELD_TYPE_PARAM = "analyzerFieldType";
- public static final String ENTITY_TYPE = "{EntityType}";
-
- private SelectorParams srcInclusions = new SelectorParams();
- private Collection<SelectorParams> srcExclusions = new ArrayList<>();
-
- private FieldNameSelector srcSelector = null;
-
- private String modelFile = null;
- private String analyzerFieldType = null;
-
- /**
- * If pattern is null, this is a literal field name. If pattern is non-null then this
- * is a replacement string that may contain meta-characters (ie: capture group identifiers)
- * @see #pattern
- */
- private String dest = null;
- /** @see #dest */
- private Pattern pattern = null;
-
- protected final FieldNameSelector getSourceSelector() {
- if (null != srcSelector) return srcSelector;
-
- throw new SolrException(SERVER_ERROR, "selector was never initialized, inform(SolrCore) never called???");
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public void init(NamedList args) {
-
- // high level (loose) check for which type of config we have.
- //
- // individual init methods do more strict syntax checking
- if (0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM, 0) ) {
- initSourceSelectorSyntax(args);
- } else if (0 <= args.indexOf(PATTERN_PARAM, 0) && 0 <= args.indexOf(REPLACEMENT_PARAM, 0)) {
- initSimpleRegexReplacement(args);
- } else {
- throw new SolrException(SERVER_ERROR, "A combination of either '" + SOURCE_PARAM + "' + '"+
- DEST_PARAM + "', or '" + REPLACEMENT_PARAM + "' + '" +
- PATTERN_PARAM + "' init params are mandatory");
- }
-
- Object modelParam = args.remove(MODEL_PARAM);
- if (null == modelParam) {
- throw new SolrException(SERVER_ERROR, "Missing required init param '" + MODEL_PARAM + "'");
- }
- if ( ! (modelParam instanceof CharSequence)) {
- throw new SolrException(SERVER_ERROR, "Init param '" + MODEL_PARAM + "' must be a <str>");
- }
- modelFile = modelParam.toString();
-
- Object analyzerFieldTypeParam = args.remove(ANALYZER_FIELD_TYPE_PARAM);
- if (null == analyzerFieldTypeParam) {
- throw new SolrException(SERVER_ERROR, "Missing required init param '" + ANALYZER_FIELD_TYPE_PARAM + "'");
- }
- if ( ! (analyzerFieldTypeParam instanceof CharSequence)) {
- throw new SolrException(SERVER_ERROR, "Init param '" + ANALYZER_FIELD_TYPE_PARAM + "' must be a <str>");
- }
- analyzerFieldType = analyzerFieldTypeParam.toString();
-
- if (0 < args.size()) {
- throw new SolrException(SERVER_ERROR, "Unexpected init param(s): '" + args.getName(0) + "'");
- }
-
- super.init(args);
- }
-
- /**
- * init helper method that should only be called when we know for certain that both the
- * "source" and "dest" init params do <em>not</em> exist.
- */
- @SuppressWarnings("unchecked")
- private void initSimpleRegexReplacement(NamedList args) {
- // The syntactic sugar for the case where there is only one regex pattern for source and the same pattern
- // is used for the destination pattern...
- //
- // pattern != null && replacement != null
- //
- // ...as top level elements, with no other config options specified
-
- // if we got here we know we had pattern and replacement, now check for the other two so that we can give a better
- // message than "unexpected"
- if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0) ) {
- throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
- PATTERN_PARAM + " and " + REPLACEMENT_PARAM + " but also found " + SOURCE_PARAM + " or " + DEST_PARAM);
- }
-
- assert args.indexOf(SOURCE_PARAM, 0) < 0;
-
- Object patt = args.remove(PATTERN_PARAM);
- Object replacement = args.remove(REPLACEMENT_PARAM);
-
- if (null == patt || null == replacement) {
- throw new SolrException(SERVER_ERROR, "Init params '" + PATTERN_PARAM + "' and '" +
- REPLACEMENT_PARAM + "' are both mandatory if '" + SOURCE_PARAM + "' and '"+
- DEST_PARAM + "' are not both specified");
- }
-
- if (0 != args.size()) {
- throw new SolrException(SERVER_ERROR, "Init params '" + REPLACEMENT_PARAM + "' and '" +
- PATTERN_PARAM + "' must be children of '" + DEST_PARAM +
- "' to be combined with other options.");
- }
-
- if (!(replacement instanceof String)) {
- throw new SolrException(SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a string (i.e. <str>)");
- }
- if (!(patt instanceof String)) {
- throw new SolrException(SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string (i.e. <str>)");
- }
-
- dest = replacement.toString();
- try {
- this.pattern = Pattern.compile(patt.toString());
- } catch (PatternSyntaxException pe) {
- throw new SolrException(SERVER_ERROR, "Init param " + PATTERN_PARAM +
- " is not a valid regex pattern: " + patt, pe);
-
- }
- srcInclusions = new SelectorParams();
- srcInclusions.fieldRegex = Collections.singletonList(this.pattern);
- }
-
- /**
-  * init helper method that should only be called when we know for certain that both the
-  * "source" and "dest" init params <em>do</em> exist.
-  * <p>
-  * The source may be given as one or more plain strings or as a single nested list of
-  * selector options (optionally containing "exclude" lists); the dest may be a plain
-  * string or a list containing a pattern and a replacement.
-  *
-  * @param args the init params; the dest and source entries are removed from it
-  * @throws SolrException if short hand and full syntax are mixed, if a nested source or
-  *         one of its "exclude" children is malformed or has unexpected children, or if
-  *         the dest is neither a string nor a well formed pattern/replacement list
-  */
- @SuppressWarnings("unchecked")
- private void initSourceSelectorSyntax(NamedList args) {
-   // Full and complete syntax where source and dest are mandatory.
-   //
-   // source may be a single string or a selector.
-   // dest may be a single string or list containing pattern and replacement
-   //
-   //   source != null && dest != null
-
-   // if we got here we know we had source and dest, now check for the other two so that we can give a better
-   // message than "unexpected"
-   if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <= args.indexOf(REPLACEMENT_PARAM, 0) ) {
-     throw new SolrException(SERVER_ERROR,"Short hand syntax must not be mixed with full syntax. Found " +
-         SOURCE_PARAM + " and " + DEST_PARAM + " but also found " + PATTERN_PARAM + " or " + REPLACEMENT_PARAM);
-   }
-
-   Object d = args.remove(DEST_PARAM);
-   assert null != d;
-
-   List<Object> sources = args.getAll(SOURCE_PARAM);
-   assert null != sources;
-
-   if (1 == sources.size()) {
-     if (sources.get(0) instanceof NamedList) {
-       // nested set of selector options
-       NamedList selectorConfig = (NamedList) args.remove(SOURCE_PARAM);
-
-       srcInclusions = parseSelectorParams(selectorConfig);
-
-       List<Object> excList = selectorConfig.getAll("exclude");
-
-       for (Object excObj : excList) {
-         if (null == excObj) {
-           throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
-               "' child 'exclude' can not be null");
-         }
-         if (!(excObj instanceof NamedList)) {
-           throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
-               "' child 'exclude' must be <lst/>");
-         }
-         NamedList exc = (NamedList) excObj;
-         srcExclusions.add(parseSelectorParams(exc));
-         if (0 < exc.size()) {
-           // report the leftover child of this 'exclude' list itself, not the first
-           // entry of the enclosing selector config
-           throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
-               "' has unexpected 'exclude' sub-param(s): '"
-               + exc.getName(0) + "'");
-         }
-         // call once per instance
-         selectorConfig.remove("exclude");
-       }
-
-       if (0 < selectorConfig.size()) {
-         throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
-             "' contains unexpected child param(s): '" +
-             selectorConfig.getName(0) + "'");
-       }
-       // the nested selector has been handled; drop it from the local list so that the
-       // plain-string handling below does not see it
-       sources.remove(0);
-     }
-   }
-   if (1 <= sources.size()) {
-     // source better be one or more strings
-     srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs("source"));
-   }
-   if (srcInclusions == null) {
-     throw new SolrException(SERVER_ERROR,
-         "Init params do not specify any field from which to extract entities, please supply either "
-         + SOURCE_PARAM + " and " + DEST_PARAM + " or " + PATTERN_PARAM + " and " + REPLACEMENT_PARAM + ". See javadocs " +
-         "for OpenNLPExtractNamedEntitiesUpdateProcessor for further details.");
-   }
-
-   if (d instanceof NamedList) {
-     NamedList destList = (NamedList) d;
-
-     Object patt = destList.remove(PATTERN_PARAM);
-     Object replacement = destList.remove(REPLACEMENT_PARAM);
-
-     if (null == patt || null == replacement) {
-       throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
-           PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
-           "' are both mandatory and can not be null");
-     }
-     if (! (patt instanceof String && replacement instanceof String)) {
-       throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' children '" +
-           PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
-           "' must both be strings (i.e. <str>)");
-     }
-     if (0 != destList.size()) {
-       throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' has unexpected children: '"
-           + destList.getName(0) + "'");
-     }
-
-     try {
-       this.pattern = Pattern.compile(patt.toString());
-     } catch (PatternSyntaxException pe) {
-       throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' child '" + PATTERN_PARAM +
-           "' is not a valid regex pattern: " + patt, pe);
-     }
-     dest = replacement.toString();
-
-   } else if (d instanceof String) {
-     dest = d.toString();
-   } else {
-     throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "' must either be a string " +
-         "(i.e. <str>) or a list (i.e. <lst>) containing '" +
-         PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM + "'");
-   }
-
- }
-
- /**
-  * Builds the source field selector from the configured inclusions, wraps it once per
-  * configured exclusion, and then requests the NER tagger model for the configured model
-  * file through the core's resource loader (the returned model is not used here).
-  *
-  * @throws IllegalArgumentException wrapping any IOException raised while requesting the model
-  */
- @Override
- public void inform(final SolrCore core) {
-
-   srcSelector = FieldMutatingUpdateProcessor.createFieldNameSelector(
-       core.getResourceLoader(), core, srcInclusions, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
-
-   for (SelectorParams exclusion : srcExclusions) {
-     final FieldNameSelector excluded = FieldMutatingUpdateProcessor.createFieldNameSelector(
-         core.getResourceLoader(), core, exclusion, FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
-     srcSelector = FieldMutatingUpdateProcessor.wrap(srcSelector, excluded);
-   }
-
-   try {
-     OpenNLPOpsFactory.getNERTaggerModel(modelFile, core.getResourceLoader());
-   } catch (IOException ioe) {
-     throw new IllegalArgumentException(ioe);
-   }
- }
-
- @Override
- public final UpdateRequestProcessor getInstance
- (SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
- final FieldNameSelector srcSelector = getSourceSelector();
- return new UpdateRequestProcessor(next) {
- private final NLPNERTaggerOp nerTaggerOp;
- private Analyzer analyzer = null;
- {
- try {
- nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
- FieldType fieldType = req.getSchema().getFieldTypeByName(analyzerFieldType);
- if (fieldType == null) {
- throw new SolrException
- (SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" + analyzerFieldType + "' not found in the schema.");
- }
- analyzer = fieldType.getIndexAnalyzer();
- } catch (IOException e) {
- throw new IllegalArgumentException(e);
- }
- }
-
- @Override
- public void processAdd(AddUpdateCommand cmd) throws IOException {
-
- final SolrInputDocument doc = cmd.getSolrInputDocument();
-
- // Destination may be regex replace string, or "{EntityType}" replaced by
- // each entity's type, both of which can cause multiple output fields.
- Map<String,SolrInputField> destMap = new HashMap<>();
-
- // preserve initial values
- for (final String fname : doc.getFieldNames()) {
- if ( ! srcSelector.shouldMutate(fname)) continue;
-
- Collection<Object> srcFieldValues = doc.getFieldValues(fname);
- if (srcFieldValues == null || srcFieldValues.isEmpty()) continue;
-
- String resolvedDest = dest;
-
- if (pattern != null) {
- Matcher matcher = pattern.matcher(fname);
- if (matcher.find()) {
- resolvedDest = matcher.replaceAll(dest);
- } else {
- log.debug("srcSelector.shouldMutate(\"{}\") returned true, " +
- "but replacement pattern did not match, field skipped.", fname);
- continue;
- }
- }
-
- for (Object val : srcFieldValues) {
- for (Pair<String,String> entity : extractTypedNamedEntities(val)) {
- SolrInputField destField = null;
- String entityName = entity.first();
- String entityType = entity.second();
- final String resolved = resolvedDest.replace(ENTITY_TYPE, entityType);
- if (doc.containsKey(resolved)) {
- destField = doc.getField(resolved);
- } else {
- SolrInputField targetField = destMap.get(resolved);
- if (targetField == null) {
- destField = new SolrInputField(resolved);
- } else {
- destField = targetField;
- }
- }
- destField.addValue(entityName);
-
- // put it in map to avoid concurrent modification...
- destMap.put(resolved, destField);
- }
- }
- }
-
- for (Map.Entry<String,SolrInputField> entry : destMap.entrySet()) {
- doc.put(entry.getKey(), entry.getValue());
- }
- super.processAdd(cmd);
- }
-
- /** Using configured NER model, extracts (name, type) pairs from the given source field value */
- private List<Pair<String,String>> extractTypedNamedEntities(Object srcFieldValue) throws IOException {
- List<Pair<String,String>> entitiesWithType = new ArrayList<>();
- List<String> terms = new ArrayList<>();
- List<Integer> startOffsets = new ArrayList<>();
- List<Integer> endOffsets = new ArrayList<>();
- String fullText = srcFieldValue.toString();
- TokenStream tokenStream = analyzer.tokenStream("", fullText);
- CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
- OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
- FlagsAttribute flagsAtt = tokenStream.addAttribute(FlagsAttribute.class);
- tokenStream.reset();
- synchronized (nerTaggerOp) {
- while (tokenStream.incrementToken()) {
- terms.add(termAtt.toString());
- startOffsets.add(offsetAtt.startOffset());
- endOffsets.add(offsetAtt.endOffset());
- boolean endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
- if (endOfSentence) { // extract named entities one sentence at a time
- extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
- }
- }
- tokenStream.end();
- tokenStream.close();
- if (!terms.isEmpty()) { // In case last token of last sentence isn't properly flagged with EOS_FLAG_BIT
- extractEntitiesFromSentence(fullText, terms, startOffsets, endOffsets, entitiesWithType);
- }
- nerTaggerOp.reset(); // Forget all adaptive data collected during previous calls
- }
- return entitiesWithType;
- }
-
- private void extractEntitiesFromSentence(String fullText, List<String> terms, List<Integer> startOffsets,
- List<Integer> endOffsets, List<Pair<String,String>> entitiesWithType) {
- for (Span span : nerTaggerOp.getNames(terms.toArray(new String[terms.size()]))) {
- String text = fullText.substring(startOffsets.get(span.getStart()), endOffsets.get(span.getEnd() - 1));
- entitiesWithType.add(new Pair<>(text, span.getType()));
- }
- terms.clear();
- startOffsets.clear();
- endOffsets.clear();
- }
- };
- }
-
- /** macro */
- private static SelectorParams parseSelectorParams(NamedList args) {
- return FieldMutatingUpdateProcessorFactory.parseSelectorParams(args);
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/package.html b/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/package.html
deleted file mode 100644
index 1388c29..0000000
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/update/processor/package.html
+++ /dev/null
@@ -1,24 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<!-- not a package-info.java, because we already defined this package in core/ -->
-<html>
- <body>
- Update request processor invoking OpenNLP Named Entity Recognition over configured
- source field(s), populating configured target field(s) with the results.
- </body>
-</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/java/overview.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/java/overview.html b/solr/contrib/analysis-extras/src/java/overview.html
deleted file mode 100644
index f3d70ca..0000000
--- a/solr/contrib/analysis-extras/src/java/overview.html
+++ /dev/null
@@ -1,21 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<html>
-<body>
-Apache Solr Search Server: Analysis Extras contrib
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/ICUCollationField.java
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/ICUCollationField.java b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/ICUCollationField.java
new file mode 100644
index 0000000..f723a25
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/ICUCollationField.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.schema;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.response.TextResponseWriter;
+import org.apache.solr.search.QParser;
+import org.apache.solr.uninverting.UninvertingReader.Type;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
+
+/**
+ * Field for collated sort keys.
+ * These can be used for locale-sensitive sort and range queries.
+ * <p>
+ * This field can be created in two ways:
+ * <ul>
+ * <li>Based upon a system collator associated with a Locale.
+ * <li>Based upon a tailored ruleset.
+ * </ul>
+ * <p>
+ * Using a System collator:
+ * <ul>
+ * <li>locale: RFC 3066 locale ID (mandatory)
+ * <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
+ * <li>decomposition: 'no', or 'canonical' (optional)
+ * </ul>
+ * <p>
+ * Using a Tailored ruleset:
+ * <ul>
+ * <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
+ * <li>strength: 'primary','secondary','tertiary', 'quaternary', or 'identical' (optional)
+ * <li>decomposition: 'no' or 'canonical' (optional)
+ * </ul>
+ * <p>
+ * Expert options:
+ * <ul>
+ * <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore punctuation/whitespace.
+ * <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore accents but not case.
+ * <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first when case is not ignored.
+ * <li>numeric: 'true' or 'false'. Digits are sorted according to numeric value, e.g. foobar-9 sorts before foobar-10
+ * <li>variableTop: single character or contraction. Controls what is variable for 'alternate'
+ * </ul>
+ *
+ * @see Collator
+ * @see ULocale
+ * @see RuleBasedCollator
+ */
+public class ICUCollationField extends FieldType {
+ private Analyzer analyzer;
+
+ @Override
+ protected void init(IndexSchema schema, Map<String,String> args) {
+ properties |= TOKENIZED; // this ensures our analyzer gets hit
+ setup(schema.getResourceLoader(), args);
+ super.init(schema, args);
+ }
+
+ /**
+ * Setup the field according to the provided parameters
+ */
+ private void setup(ResourceLoader loader, Map<String,String> args) {
+ String custom = args.remove("custom");
+ String localeID = args.remove("locale");
+ String strength = args.remove("strength");
+ String decomposition = args.remove("decomposition");
+
+ String alternate = args.remove("alternate");
+ String caseLevel = args.remove("caseLevel");
+ String caseFirst = args.remove("caseFirst");
+ String numeric = args.remove("numeric");
+ String variableTop = args.remove("variableTop");
+
+ if (custom == null && localeID == null)
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
+
+ if (custom != null && localeID != null)
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
+ + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
+ + "Then save the entire customized ruleset to a file, and use with the custom parameter");
+
+ final Collator collator;
+
+ if (localeID != null) {
+ // create from a system collator, based on Locale.
+ collator = createFromLocale(localeID);
+ } else {
+ // create from a custom ruleset
+ collator = createFromRules(custom, loader);
+ }
+
+ // set the strength flag, otherwise it will be the default.
+ if (strength != null) {
+ if (strength.equalsIgnoreCase("primary"))
+ collator.setStrength(Collator.PRIMARY);
+ else if (strength.equalsIgnoreCase("secondary"))
+ collator.setStrength(Collator.SECONDARY);
+ else if (strength.equalsIgnoreCase("tertiary"))
+ collator.setStrength(Collator.TERTIARY);
+ else if (strength.equalsIgnoreCase("quaternary"))
+ collator.setStrength(Collator.QUATERNARY);
+ else if (strength.equalsIgnoreCase("identical"))
+ collator.setStrength(Collator.IDENTICAL);
+ else
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strength);
+ }
+
+ // set the decomposition flag, otherwise it will be the default.
+ if (decomposition != null) {
+ if (decomposition.equalsIgnoreCase("no"))
+ collator.setDecomposition(Collator.NO_DECOMPOSITION);
+ else if (decomposition.equalsIgnoreCase("canonical"))
+ collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
+ else
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decomposition);
+ }
+
+ // expert options: concrete subclasses are always a RuleBasedCollator
+ RuleBasedCollator rbc = (RuleBasedCollator) collator;
+ if (alternate != null) {
+ if (alternate.equalsIgnoreCase("shifted")) {
+ rbc.setAlternateHandlingShifted(true);
+ } else if (alternate.equalsIgnoreCase("non-ignorable")) {
+ rbc.setAlternateHandlingShifted(false);
+ } else {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternate);
+ }
+ }
+ if (caseLevel != null) {
+ rbc.setCaseLevel(Boolean.parseBoolean(caseLevel));
+ }
+ if (caseFirst != null) {
+ if (caseFirst.equalsIgnoreCase("lower")) {
+ rbc.setLowerCaseFirst(true);
+ } else if (caseFirst.equalsIgnoreCase("upper")) {
+ rbc.setUpperCaseFirst(true);
+ } else {
+ throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirst);
+ }
+ }
+ if (numeric != null) {
+ rbc.setNumericCollation(Boolean.parseBoolean(numeric));
+ }
+ if (variableTop != null) {
+ rbc.setVariableTop(variableTop);
+ }
+
+ analyzer = new ICUCollationKeyAnalyzer(collator);
+ }
+
+ /**
+ * Create a locale from localeID.
+ * Then return the appropriate collator for the locale.
+ */
+ private Collator createFromLocale(String localeID) {
+ return Collator.getInstance(new ULocale(localeID));
+ }
+
+ /**
+ * Read custom rules from a file, and create a RuleBasedCollator
+ * The file cannot support comments, as # might be in the rules!
+ */
+ static Collator createFromRules(String fileName, ResourceLoader loader) {
+ InputStream input = null;
+ try {
+ input = loader.openResource(fileName);
+ String rules = IOUtils.toString(input, "UTF-8");
+ return new RuleBasedCollator(rules);
+ } catch (Exception e) {
+ // io error or invalid rules
+ throw new RuntimeException(e);
+ } finally {
+ IOUtils.closeQuietly(input);
+ }
+ }
+
+ @Override
+ public void write(TextResponseWriter writer, String name, IndexableField f) throws IOException {
+ writer.writeStr(name, f.stringValue(), true);
+ }
+
+ @Override
+ public SortField getSortField(SchemaField field, boolean top) {
+ return getStringSort(field, top);
+ }
+
+ @Override
+ public Type getUninversionType(SchemaField sf) {
+ if (sf.multiValued()) {
+ return Type.SORTED_SET_BINARY;
+ } else {
+ return Type.SORTED;
+ }
+ }
+
+ @Override
+ public Analyzer getIndexAnalyzer() {
+ return analyzer;
+ }
+
+ @Override
+ public Analyzer getQueryAnalyzer() {
+ return analyzer;
+ }
+
+ /**
+ * analyze the text with the analyzer, instead of the collator.
+ * because icu collators are not thread safe, this keeps things
+ * simple (we already have a threadlocal clone in the reused TS)
+ */
+ private BytesRef getCollationKey(String field, String text) {
+ try (TokenStream source = analyzer.tokenStream(field, text)) {
+ source.reset();
+
+ TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
+
+
+ // we control the analyzer here: most errors are impossible
+ if (!source.incrementToken())
+ throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
+ BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
+ assert !source.incrementToken();
+
+ source.end();
+ return bytes;
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to analyze text: " + text, e);
+ }
+ }
+
+ @Override
+ public Query getRangeQuery(QParser parser, SchemaField field, String part1, String part2, boolean minInclusive, boolean maxInclusive) {
+ String f = field.getName();
+ BytesRef low = part1 == null ? null : getCollationKey(f, part1);
+ BytesRef high = part2 == null ? null : getCollationKey(f, part2);
+ if (!field.indexed() && field.hasDocValues()) {
+ return SortedSetDocValuesField.newSlowRangeQuery(
+ field.getName(), low, high, minInclusive, maxInclusive);
+ } else {
+ return new TermRangeQuery(field.getName(), low, high, minInclusive, maxInclusive);
+ }
+ }
+
+ @Override
+ protected void checkSupportsDocValues() { // we support DocValues
+ }
+
+ @Override
+ public List<IndexableField> createFields(SchemaField field, Object value) {
+ if (field.hasDocValues()) {
+ List<IndexableField> fields = new ArrayList<>();
+ fields.add(createField(field, value));
+ final BytesRef bytes = getCollationKey(field.getName(), value.toString());
+ if (field.multiValued()) {
+ fields.add(new SortedSetDocValuesField(field.getName(), bytes));
+ } else {
+ fields.add(new SortedDocValuesField(field.getName(), bytes));
+ }
+ return fields;
+ } else {
+ return Collections.singletonList(createField(field, value));
+ }
+ }
+
+ @Override
+ public Object marshalSortValue(Object value) {
+ return marshalBase64SortValue(value);
+ }
+
+ @Override
+ public Object unmarshalSortValue(Object value) {
+ return unmarshalBase64SortValue(value);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/6c070b4a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/package.html
----------------------------------------------------------------------
diff --git a/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/package.html b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/package.html
new file mode 100644
index 0000000..27d68db
--- /dev/null
+++ b/solr/contrib/analysis-extras/src/main/java/org/apache/solr/schema/package.html
@@ -0,0 +1,23 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- not a package-info.java, because we already defined this package in core/ -->
+<html>
+<body>
+<code>FieldType</code> plugins that have additional dependencies.
+</body>
+</html>