You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by da...@apache.org on 2017/12/27 15:04:10 UTC
[17/54] [abbrv] lucene-solr:jira/solr-11702: LUCENE-2899: Add OpenNLP
Analysis capabilities as a module
LUCENE-2899: Add OpenNLP Analysis capabilities as a module
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/3e2f9e62
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/3e2f9e62
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/3e2f9e62
Branch: refs/heads/jira/solr-11702
Commit: 3e2f9e62d772218bf1fcae6d58542fad3ec43742
Parents: d02d1f1
Author: Steve Rowe <sa...@apache.org>
Authored: Fri Dec 15 11:24:18 2017 -0500
Committer: Steve Rowe <sa...@apache.org>
Committed: Fri Dec 15 11:24:18 2017 -0500
----------------------------------------------------------------------
dev-tools/idea/.idea/ant.xml | 1 +
dev-tools/idea/.idea/modules.xml | 1 +
dev-tools/idea/.idea/workspace.xml | 83 +-
.../idea/lucene/analysis/opennlp/opennlp.iml | 30 +
.../contrib/analysis-extras/analysis-extras.iml | 1 +
.../lucene/analysis/opennlp/pom.xml.template | 78 +
.../maven/lucene/analysis/pom.xml.template | 1 +
lucene/CHANGES.txt | 9 +
lucene/analysis/README.txt | 5 +
lucene/analysis/build.xml | 6 +-
.../miscellaneous/TypeAsSynonymFilter.java | 80 +
.../TypeAsSynonymFilterFactory.java | 55 +
...ache.lucene.analysis.util.TokenFilterFactory | 1 +
.../analysis/minhash/MinHashFilterTest.java | 6 +-
.../TestTypeAsSynonymFilterFactory.java | 50 +
lucene/analysis/opennlp/build.xml | 118 +
lucene/analysis/opennlp/ivy.xml | 29 +
.../analysis/opennlp/OpenNLPChunkerFilter.java | 108 +
.../opennlp/OpenNLPChunkerFilterFactory.java | 81 +
.../opennlp/OpenNLPLemmatizerFilter.java | 123 +
.../opennlp/OpenNLPLemmatizerFilterFactory.java | 89 +
.../analysis/opennlp/OpenNLPPOSFilter.java | 96 +
.../opennlp/OpenNLPPOSFilterFactory.java | 71 +
.../opennlp/OpenNLPSentenceBreakIterator.java | 224 ++
.../analysis/opennlp/OpenNLPTokenizer.java | 98 +
.../opennlp/OpenNLPTokenizerFactory.java | 79 +
.../lucene/analysis/opennlp/package-info.java | 21 +
.../analysis/opennlp/tools/NLPChunkerOp.java | 41 +
.../analysis/opennlp/tools/NLPLemmatizerOp.java | 80 +
.../analysis/opennlp/tools/NLPNERTaggerOp.java | 56 +
.../analysis/opennlp/tools/NLPPOSTaggerOp.java | 41 +
.../opennlp/tools/NLPSentenceDetectorOp.java | 50 +
.../analysis/opennlp/tools/NLPTokenizerOp.java | 48 +
.../opennlp/tools/OpenNLPOpsFactory.java | 176 +
.../analysis/opennlp/tools/package-info.java | 21 +
lucene/analysis/opennlp/src/java/overview.html | 61 +
...ache.lucene.analysis.util.TokenFilterFactory | 18 +
...apache.lucene.analysis.util.TokenizerFactory | 16 +
.../lucene/analysis/opennlp/en-test-chunker.bin | Bin 0 -> 89915 bytes
.../lucene/analysis/opennlp/en-test-lemmas.dict | 12 +
.../analysis/opennlp/en-test-lemmatizer.bin | Bin 0 -> 7370 bytes
.../analysis/opennlp/en-test-ner-person.bin | Bin 0 -> 1700 bytes
.../analysis/opennlp/en-test-pos-maxent.bin | Bin 0 -> 18424 bytes
.../lucene/analysis/opennlp/en-test-sent.bin | Bin 0 -> 1050 bytes
.../analysis/opennlp/en-test-tokenizer.bin | Bin 0 -> 15096 bytes
.../TestOpenNLPChunkerFilterFactory.java | 74 +
.../TestOpenNLPLemmatizerFilterFactory.java | 169 +
.../opennlp/TestOpenNLPPOSFilterFactory.java | 95 +
.../TestOpenNLPSentenceBreakIterator.java | 201 +
.../opennlp/TestOpenNLPTokenizerFactory.java | 97 +
.../src/tools/test-model-data/README.txt | 6 +
.../src/tools/test-model-data/chunks.txt | 3566 ++++++++++++++++++
.../src/tools/test-model-data/lemmas.txt | 875 +++++
.../tools/test-model-data/ner_TrainerParams.txt | 21 +
.../src/tools/test-model-data/ner_flashman.txt | 143 +
.../opennlp/src/tools/test-model-data/pos.txt | 30 +
.../src/tools/test-model-data/sentences.txt | 144 +
.../src/tools/test-model-data/tokenizer.txt | 69 +
.../apache/lucene/analysis/TestStopFilter.java | 9 +-
lucene/ivy-versions.properties | 3 +
lucene/licenses/opennlp-maxent-3.0.3.jar.sha1 | 1 +
lucene/licenses/opennlp-maxent-LICENSE-ASL.txt | 202 +
lucene/licenses/opennlp-maxent-NOTICE.txt | 6 +
lucene/licenses/opennlp-tools-1.8.3.jar.sha1 | 1 +
lucene/licenses/opennlp-tools-LICENSE-ASL.txt | 202 +
lucene/licenses/opennlp-tools-NOTICE.txt | 6 +
lucene/module-build.xml | 22 +
.../analysis/BaseTokenStreamTestCase.java | 32 +-
solr/CHANGES.txt | 7 +
solr/contrib/analysis-extras/README.txt | 10 +-
solr/contrib/analysis-extras/build.xml | 20 +-
solr/contrib/analysis-extras/ivy.xml | 3 +
...ractNamedEntitiesUpdateProcessorFactory.java | 571 +++
.../apache/solr/update/processor/package.html | 24 +
.../collection1/conf/en-test-ner-person.bin | Bin 0 -> 1700 bytes
.../solr/collection1/conf/en-test-sent.bin | Bin 0 -> 1050 bytes
.../solr/collection1/conf/en-test-tokenizer.bin | Bin 0 -> 15096 bytes
.../collection1/conf/schema-opennlp-extract.xml | 49 +
.../conf/solrconfig-opennlp-extract.xml | 206 +
.../solrconfig.snippet.randomindexconfig.xml | 48 +
...ractNamedEntitiesUpdateProcessorFactory.java | 192 +
.../processor/UpdateProcessorTestBase.java | 168 -
solr/licenses/opennlp-maxent-3.0.3.jar.sha1 | 1 +
solr/licenses/opennlp-maxent-LICENSE-ASL.txt | 202 +
solr/licenses/opennlp-maxent-NOTICE.txt | 6 +
solr/licenses/opennlp-tools-1.8.3.jar.sha1 | 1 +
solr/licenses/opennlp-tools-LICENSE-ASL.txt | 202 +
solr/licenses/opennlp-tools-NOTICE.txt | 6 +
.../solr-ref-guide/src/filter-descriptions.adoc | 32 +
solr/solr-ref-guide/src/language-analysis.adoc | 208 +
solr/solr-ref-guide/src/tokenizers.adoc | 4 +
.../src/update-request-processors.adoc | 6 +
.../processor/UpdateProcessorTestBase.java | 168 +
93 files changed, 10040 insertions(+), 232 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/.idea/ant.xml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/.idea/ant.xml b/dev-tools/idea/.idea/ant.xml
index 8723e63..6c7bc8c 100644
--- a/dev-tools/idea/.idea/ant.xml
+++ b/dev-tools/idea/.idea/ant.xml
@@ -11,6 +11,7 @@
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/icu/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/kuromoji/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/morfologik/build.xml" />
+ <buildFile url="file://$PROJECT_DIR$/lucene/analysis/opennlp/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/phonetic/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/smartcn/build.xml" />
<buildFile url="file://$PROJECT_DIR$/lucene/analysis/stempel/build.xml" />
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/.idea/modules.xml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/.idea/modules.xml b/dev-tools/idea/.idea/modules.xml
index 7ad2a78..4df1000 100644
--- a/dev-tools/idea/.idea/modules.xml
+++ b/dev-tools/idea/.idea/modules.xml
@@ -15,6 +15,7 @@
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/icu/icu.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/kuromoji/kuromoji.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/morfologik/morfologik.iml" />
+ <module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/opennlp/opennlp.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/phonetic/phonetic.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/smartcn/smartcn.iml" />
<module group="Lucene/Analysis" filepath="$PROJECT_DIR$/lucene/analysis/stempel/stempel.iml" />
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/.idea/workspace.xml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/.idea/workspace.xml b/dev-tools/idea/.idea/workspace.xml
index e22108f..11794af 100644
--- a/dev-tools/idea/.idea/workspace.xml
+++ b/dev-tools/idea/.idea/workspace.xml
@@ -44,6 +44,14 @@
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
+ <configuration default="false" name="Module analyzers-opennlp" type="JUnit" factoryName="JUnit">
+ <module name="opennlp" />
+ <option name="TEST_OBJECT" value="pattern" />
+ <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/lucene/analysis/opennlp" />
+ <option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
+ <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
+ <patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
+ </configuration>
<configuration default="false" name="Module analyzers-phonetic" type="JUnit" factoryName="JUnit">
<module name="phonetic" />
<option name="TEST_OBJECT" value="pattern" />
@@ -333,48 +341,49 @@
<patterns><pattern testClass=".*\.Test[^.]*|.*\.[^.]*Test" /></patterns>
</configuration>
- <list size="41">
+ <list size="42">
<item index="0" class="java.lang.String" itemvalue="JUnit.Lucene core" />
<item index="1" class="java.lang.String" itemvalue="JUnit.Module analyzers-common" />
<item index="2" class="java.lang.String" itemvalue="JUnit.Module analyzers-icu" />
<item index="3" class="java.lang.String" itemvalue="JUnit.Module analyzers-kuromoji" />
<item index="4" class="java.lang.String" itemvalue="JUnit.Module analyzers-morfologik" />
- <item index="5" class="java.lang.String" itemvalue="JUnit.Module analyzers-phonetic" />
- <item index="6" class="java.lang.String" itemvalue="JUnit.Module analyzers-smartcn" />
- <item index="7" class="java.lang.String" itemvalue="JUnit.Module analyzers-stempel" />
- <item index="8" class="java.lang.String" itemvalue="JUnit.Module analyzers-uima" />
- <item index="9" class="java.lang.String" itemvalue="JUnit.Module backward-codecs" />
- <item index="10" class="java.lang.String" itemvalue="JUnit.Module benchmark" />
- <item index="11" class="java.lang.String" itemvalue="JUnit.Module classification" />
- <item index="12" class="java.lang.String" itemvalue="JUnit.Module codecs" />
- <item index="13" class="java.lang.String" itemvalue="JUnit.Module expressions" />
- <item index="14" class="java.lang.String" itemvalue="JUnit.Module facet" />
- <item index="15" class="java.lang.String" itemvalue="JUnit.Module grouping" />
- <item index="16" class="java.lang.String" itemvalue="JUnit.Module highlighter" />
- <item index="17" class="java.lang.String" itemvalue="JUnit.Module join" />
- <item index="18" class="java.lang.String" itemvalue="JUnit.Module memory" />
- <item index="19" class="java.lang.String" itemvalue="JUnit.Module misc" />
- <item index="20" class="java.lang.String" itemvalue="JUnit.Module queries" />
- <item index="21" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
- <item index="22" class="java.lang.String" itemvalue="JUnit.Module replicator" />
- <item index="23" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
- <item index="24" class="java.lang.String" itemvalue="JUnit.Module spatial" />
- <item index="25" class="java.lang.String" itemvalue="JUnit.Module spatial-extras" />
- <item index="26" class="java.lang.String" itemvalue="JUnit.Module spatial3d" />
- <item index="27" class="java.lang.String" itemvalue="JUnit.Module suggest" />
- <item index="28" class="java.lang.String" itemvalue="Application.solrcloud" />
- <item index="29" class="java.lang.String" itemvalue="JUnit.Solr core" />
- <item index="30" class="java.lang.String" itemvalue="JUnit.Solrj" />
- <item index="31" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
- <item index="32" class="java.lang.String" itemvalue="JUnit.Solr analytics contrib" />
- <item index="33" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
- <item index="34" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
- <item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
- <item index="36" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
- <item index="37" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
- <item index="38" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
- <item index="39" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
- <item index="40" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
+ <item index="5" class="java.lang.String" itemvalue="JUnit.Module analyzers-opennlp" />
+ <item index="6" class="java.lang.String" itemvalue="JUnit.Module analyzers-phonetic" />
+ <item index="7" class="java.lang.String" itemvalue="JUnit.Module analyzers-smartcn" />
+ <item index="8" class="java.lang.String" itemvalue="JUnit.Module analyzers-stempel" />
+ <item index="9" class="java.lang.String" itemvalue="JUnit.Module analyzers-uima" />
+ <item index="10" class="java.lang.String" itemvalue="JUnit.Module backward-codecs" />
+ <item index="11" class="java.lang.String" itemvalue="JUnit.Module benchmark" />
+ <item index="12" class="java.lang.String" itemvalue="JUnit.Module classification" />
+ <item index="13" class="java.lang.String" itemvalue="JUnit.Module codecs" />
+ <item index="14" class="java.lang.String" itemvalue="JUnit.Module expressions" />
+ <item index="15" class="java.lang.String" itemvalue="JUnit.Module facet" />
+ <item index="16" class="java.lang.String" itemvalue="JUnit.Module grouping" />
+ <item index="17" class="java.lang.String" itemvalue="JUnit.Module highlighter" />
+ <item index="18" class="java.lang.String" itemvalue="JUnit.Module join" />
+ <item index="19" class="java.lang.String" itemvalue="JUnit.Module memory" />
+ <item index="20" class="java.lang.String" itemvalue="JUnit.Module misc" />
+ <item index="21" class="java.lang.String" itemvalue="JUnit.Module queries" />
+ <item index="22" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
+ <item index="23" class="java.lang.String" itemvalue="JUnit.Module replicator" />
+ <item index="24" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
+ <item index="25" class="java.lang.String" itemvalue="JUnit.Module spatial" />
+ <item index="26" class="java.lang.String" itemvalue="JUnit.Module spatial-extras" />
+ <item index="27" class="java.lang.String" itemvalue="JUnit.Module spatial3d" />
+ <item index="28" class="java.lang.String" itemvalue="JUnit.Module suggest" />
+ <item index="29" class="java.lang.String" itemvalue="Application.solrcloud" />
+ <item index="30" class="java.lang.String" itemvalue="JUnit.Solr core" />
+ <item index="31" class="java.lang.String" itemvalue="JUnit.Solrj" />
+ <item index="32" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
+ <item index="33" class="java.lang.String" itemvalue="JUnit.Solr analytics contrib" />
+ <item index="34" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
+ <item index="35" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
+ <item index="36" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
+ <item index="37" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
+ <item index="38" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
+ <item index="39" class="java.lang.String" itemvalue="JUnit.Solr ltr contrib" />
+ <item index="40" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
+ <item index="41" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
</list>
</component>
</project>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/lucene/analysis/opennlp/opennlp.iml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/lucene/analysis/opennlp/opennlp.iml b/dev-tools/idea/lucene/analysis/opennlp/opennlp.iml
new file mode 100644
index 0000000..7725065
--- /dev/null
+++ b/dev-tools/idea/lucene/analysis/opennlp/opennlp.iml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+ <component name="NewModuleRootManager" inherit-compiler-output="false">
+ <output url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/opennlp/classes/java" />
+ <output-test url="file://$MODULE_DIR$/../../../idea-build/lucene/analysis/opennlp/classes/test" />
+ <exclude-output />
+ <content url="file://$MODULE_DIR$">
+ <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
+ <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
+ <sourceFolder url="file://$MODULE_DIR$/src/resources" type="java-resource" />
+ <sourceFolder url="file://$MODULE_DIR$/src/test-files" type="java-test-resource" />
+ </content>
+ <orderEntry type="inheritedJdk" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ <orderEntry type="module-library">
+ <library>
+ <CLASSES>
+ <root url="file://$MODULE_DIR$/lib" />
+ </CLASSES>
+ <JAVADOC />
+ <SOURCES />
+ <jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
+ </library>
+ </orderEntry>
+ <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
+ <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
+ <orderEntry type="module" module-name="analysis-common" />
+ <orderEntry type="module" module-name="lucene-core" />
+ </component>
+</module>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml
----------------------------------------------------------------------
diff --git a/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml b/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml
index 287b46a..7c0c0c1 100644
--- a/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml
+++ b/dev-tools/idea/solr/contrib/analysis-extras/analysis-extras.iml
@@ -37,5 +37,6 @@
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="misc" />
<orderEntry type="module" module-name="sandbox" />
+ <orderEntry type="module" module-name="opennlp" />
</component>
</module>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/maven/lucene/analysis/opennlp/pom.xml.template
----------------------------------------------------------------------
diff --git a/dev-tools/maven/lucene/analysis/opennlp/pom.xml.template b/dev-tools/maven/lucene/analysis/opennlp/pom.xml.template
new file mode 100644
index 0000000..4109a0a
--- /dev/null
+++ b/dev-tools/maven/lucene/analysis/opennlp/pom.xml.template
@@ -0,0 +1,78 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-parent</artifactId>
+ <version>@version@</version>
+ <relativePath>../../pom.xml</relativePath>
+ </parent>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-opennlp</artifactId>
+ <packaging>jar</packaging>
+ <name>Lucene OpenNLP integration</name>
+ <description>
+ Lucene OpenNLP integration
+ </description>
+ <properties>
+ <module-directory>lucene/analysis/opennlp</module-directory>
+ <relative-top-level>../../../..</relative-top-level>
+ <module-path>${relative-top-level}/${module-directory}</module-path>
+ </properties>
+ <scm>
+ <connection>scm:git:${vc-anonymous-base-url}</connection>
+ <developerConnection>scm:git:${vc-dev-base-url}</developerConnection>
+ <url>${vc-browse-base-url};f=${module-directory}</url>
+ </scm>
+ <dependencies>
+ <dependency>
+ <!-- lucene-test-framework dependency must be declared before lucene-core -->
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-test-framework</artifactId>
+ <scope>test</scope>
+ </dependency>
+ @lucene-analyzers-opennlp.internal.dependencies@
+ @lucene-analyzers-opennlp.external.dependencies@
+ @lucene-analyzers-opennlp.internal.test.dependencies@
+ @lucene-analyzers-opennlp.external.test.dependencies@
+ </dependencies>
+ <build>
+ <sourceDirectory>${module-path}/src/java</sourceDirectory>
+ <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
+ <resources>
+ <resource>
+ <directory>${module-path}/src/resources</directory>
+ </resource>
+ </resources>
+ <testResources>
+ <testResource>
+ <directory>${project.build.testSourceDirectory}</directory>
+ <excludes>
+ <exclude>**/*.java</exclude>
+ </excludes>
+ </testResource>
+ <testResource>
+ <directory>${module-path}/src/test-files</directory>
+ </testResource>
+ </testResources>
+ </build>
+</project>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/dev-tools/maven/lucene/analysis/pom.xml.template
----------------------------------------------------------------------
diff --git a/dev-tools/maven/lucene/analysis/pom.xml.template b/dev-tools/maven/lucene/analysis/pom.xml.template
index 9058abf..466ad30 100644
--- a/dev-tools/maven/lucene/analysis/pom.xml.template
+++ b/dev-tools/maven/lucene/analysis/pom.xml.template
@@ -35,6 +35,7 @@
<module>icu</module>
<module>kuromoji</module>
<module>morfologik</module>
+ <module>opennlp</module>
<module>phonetic</module>
<module>smartcn</module>
<module>stempel</module>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0fbf446..db8aaab 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -65,6 +65,15 @@ API Changes
* LUCENE-8051: LevensteinDistance renamed to LevenshteinDistance.
(Pulak Ghosh via Adrien Grand)
+New Features
+
+* LUCENE-2899: Add new module analysis/opennlp, with analysis components
+ to perform tokenization, part-of-speech tagging, lemmatization and phrase
+ chunking by invoking the corresponding OpenNLP tools. Named entity
+ recognition is also provided as a Solr update request processor.
+ (Lance Norskog, Grant Ingersoll, Joern Kottmann, Em, Kai Gülzau,
+ Rene Nederhand, Robert Muir, Steven Bower, Steve Rowe)
+
Improvements
* LUCENE-8081: Allow IndexWriter to opt out of flushing on indexing threads
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/README.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/README.txt b/lucene/analysis/README.txt
index 7dc7f53..c68584e 100644
--- a/lucene/analysis/README.txt
+++ b/lucene/analysis/README.txt
@@ -28,6 +28,9 @@ lucene-analyzers-kuromoji-XX.jar
lucene-analyzers-morfologik-XX.jar
An analyzer using the Morfologik stemming library.
+lucene-analyzers-opennlp-XX.jar
+ An analyzer using the OpenNLP natural-language processing library.
+
lucene-analyzers-phonetic-XX.jar
An add-on analysis library that provides phonetic encoders via Apache
Commons-Codec. Note: this module depends on the commons-codec jar
@@ -49,6 +52,7 @@ common/src/java
icu/src/java
kuromoji/src/java
morfologik/src/java
+opennlp/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
@@ -59,6 +63,7 @@ common/src/test
icu/src/test
kuromoji/src/test
morfologik/src/test
+opennlp/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/build.xml b/lucene/analysis/build.xml
index 844f5f3..ed1566c 100644
--- a/lucene/analysis/build.xml
+++ b/lucene/analysis/build.xml
@@ -65,6 +65,10 @@
<ant dir="morfologik" />
</target>
+ <target name="opennlp">
+ <ant dir="opennlp" />
+ </target>
+
<target name="phonetic">
<ant dir="phonetic" />
</target>
@@ -82,7 +86,7 @@
</target>
<target name="default" depends="compile"/>
- <target name="compile" depends="common,icu,kuromoji,morfologik,phonetic,smartcn,stempel,uima" />
+ <target name="compile" depends="common,icu,kuromoji,morfologik,opennlp,phonetic,smartcn,stempel,uima" />
<target name="clean">
<forall-analyzers target="clean"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilter.java
new file mode 100644
index 0000000..8269d5d
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilter.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Adds the {@link TypeAttribute#type()} as a synonym,
+ * i.e. another token at the same position, optionally with a specified prefix prepended.
+ */
+public final class TypeAsSynonymFilter extends TokenFilter {
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final String prefix;
+
+ AttributeSource.State savedToken = null;
+
+
+ public TypeAsSynonymFilter(TokenStream input) {
+ this(input, null);
+ }
+
+ /**
+ * @param input input tokenstream
+ * @param prefix Prepend this string to every token type emitted as token text.
+ * If null, nothing will be prepended.
+ */
+ public TypeAsSynonymFilter(TokenStream input, String prefix) {
+ super(input);
+ this.prefix = prefix;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (savedToken != null) { // Emit last token's type at the same position
+ restoreState(savedToken);
+ savedToken = null;
+ termAtt.setEmpty();
+ if (prefix != null) {
+ termAtt.append(prefix);
+ }
+ termAtt.append(typeAtt.type());
+ posIncrAtt.setPositionIncrement(0);
+ return true;
+ } else if (input.incrementToken()) { // No pending token type to emit
+ savedToken = captureState();
+ return true;
+ }
+ return false;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ savedToken = null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilterFactory.java
new file mode 100644
index 0000000..69708b7
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TypeAsSynonymFilterFactory.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link TypeAsSynonymFilter}.
+ * <pre class="prettyprint">
+ * <fieldType name="text_type_as_synonym" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.UAX29URLEmailTokenizerFactory"/>
+ * <filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_" />
+ * </analyzer>
+ * </fieldType></pre>
+ *
+ * <p>
+ * If the optional {@code prefix} parameter is used, the specified value will be prepended
+ * to the type, e.g. with prefix="_type_", for a token "example.com" with type "<URL>",
+ * the emitted synonym will have text "_type_<URL>".
+ */
+public class TypeAsSynonymFilterFactory extends TokenFilterFactory {
+ private final String prefix;
+
+ public TypeAsSynonymFilterFactory(Map<String,String> args) {
+ super(args);
+ prefix = get(args, "prefix"); // default value is null
+ if (!args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public TokenStream create(TokenStream input) {
+ return new TypeAsSynonymFilter(input, prefix);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index d871ad6..6dcc81c 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -80,6 +80,7 @@ org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
org.apache.lucene.analysis.miscellaneous.TruncateTokenFilterFactory
+org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilterFactory
org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory
org.apache.lucene.analysis.miscellaneous.ScandinavianFoldingFilterFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java
index a4080fe..1bc6ed7 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/minhash/MinHashFilterTest.java
@@ -183,14 +183,14 @@ public class MinHashFilterTest extends BaseTokenStreamTestCase {
TokenStream ts = createTokenStream(5, "woof woof woof woof woof", 1, 1, 100, false);
assertTokenStreamContents(ts, hashes, new int[]{0},
new int[]{24}, new String[]{MinHashFilter.MIN_HASH_TYPE}, new int[]{1}, new int[]{1}, 24, 0, null,
- true);
+ true, null);
ts = createTokenStream(5, "woof woof woof woof woof", 2, 1, 1, false);
assertTokenStreamContents(ts, new String[]{new String(new char[]{0, 0, 8449, 54077, 64133, 32857, 8605, 41409}),
new String(new char[]{0, 1, 16887, 58164, 39536, 14926, 6529, 17276})}, new int[]{0, 0},
new int[]{24, 24}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
new int[]{1, 1}, 24, 0, null,
- true);
+ true, null);
}
@Test
@@ -203,7 +203,7 @@ public class MinHashFilterTest extends BaseTokenStreamTestCase {
false);
assertTokenStreamContents(ts, hashes, new int[]{0, 0},
new int[]{49, 49}, new String[]{MinHashFilter.MIN_HASH_TYPE, MinHashFilter.MIN_HASH_TYPE}, new int[]{1, 0},
- new int[]{1, 1}, 49, 0, null, true);
+ new int[]{1, 1}, 49, 0, null, true, null);
}
private ArrayList<String> getTokens(TokenStream ts) throws IOException {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTypeAsSynonymFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTypeAsSynonymFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTypeAsSynonymFilterFactory.java
new file mode 100644
index 0000000..6beb139
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTypeAsSynonymFilterFactory.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+public class TestTypeAsSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
+
+  /** Builds a canned token carrying the given term text and type. */
+  private static Token token(String term, String type) {
+    Token t = new Token();
+    t.setEmpty();
+    t.append(term);
+    t.setType(type);
+    return t;
+  }
+
+  private static final Token[] TOKENS = { token("Visit", "<ALPHANUM>"), token("example.com", "<URL>") };
+
+  /** Without a prefix, each token is followed by a zero-position-increment synonym whose text is its type. */
+  public void testBasic() throws Exception {
+    TokenStream stream = tokenFilterFactory("TypeAsSynonym").create(new CannedTokenStream(TOKENS));
+    assertTokenStreamContents(stream, new String[] { "Visit", "<ALPHANUM>", "example.com", "<URL>" },
+        null, null, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
+  }
+
+  /** With a prefix, the configured value is prepended to each injected synonym's text. */
+  public void testPrefix() throws Exception {
+    TokenStream stream = tokenFilterFactory("TypeAsSynonym", "prefix", "_type_").create(new CannedTokenStream(TOKENS));
+    assertTokenStreamContents(stream, new String[] { "Visit", "_type_<ALPHANUM>", "example.com", "_type_<URL>" },
+        null, null, new String[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/build.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/build.xml b/lucene/analysis/opennlp/build.xml
new file mode 100644
index 0000000..e2cd20a
--- /dev/null
+++ b/lucene/analysis/opennlp/build.xml
@@ -0,0 +1,118 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project name="analyzers-opennlp" default="default">
+
+ <description>
+ OpenNLP Library Integration
+ </description>
+
+ <path id="opennlpjars">
+ <fileset dir="lib"/>
+ </path>
+
+ <property name="test.model.data.dir" location="src/tools/test-model-data"/>
+ <property name="tests.userdir" location="src/test-files"/>
+ <property name="test.model.dir" location="${tests.userdir}/org/apache/lucene/analysis/opennlp"/>
+
+ <import file="../analysis-module-build.xml"/>
+
+ <property name="analysis-extras.conf.dir"
+ location="${common.dir}/../solr/contrib/analysis-extras/src/test-files/analysis-extras/solr/collection1/conf"/>
+
+ <path id="classpath">
+ <pathelement path="${analyzers-common.jar}"/>
+ <path refid="opennlpjars"/>
+ <path refid="base.classpath"/>
+ </path>
+
+ <path id="test.classpath">
+ <path refid="test.base.classpath"/>
+ <pathelement path="${tests.userdir}"/>
+ </path>
+
+ <target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
+
+ <!--
+ This does not create real NLP models, just small unencumbered ones for the unit tests.
+ All text taken from reuters corpus.
+ Tags applied with online demos at CCG Urbana-Champaign.
+ -->
+ <target name="train-test-models" description="Train all small test models for unit tests" depends="resolve">
+ <mkdir dir="${test.model.dir}"/>
+ <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.sentdetect.training -->
+ <trainModel command="SentenceDetectorTrainer" lang="en" data="sentences.txt" model="en-test-sent.bin"/>
+ <copy file="${test.model.dir}/en-test-sent.bin" todir="${analysis-extras.conf.dir}"/>
+
+ <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.tokenizer.training -->
+ <trainModel command="TokenizerTrainer" lang="en" data="tokenizer.txt" model="en-test-tokenizer.bin"/>
+ <copy file="${test.model.dir}/en-test-tokenizer.bin" todir="${analysis-extras.conf.dir}"/>
+
+ <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.postagger.training -->
+ <trainModel command="POSTaggerTrainer" lang="en" data="pos.txt" model="en-test-pos-maxent.bin"/>
+
+ <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.chunker.training -->
+ <trainModel command="ChunkerTrainerME" lang="en" data="chunks.txt" model="en-test-chunker.bin"/>
+
+ <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.namefind.training -->
+ <trainModel command="TokenNameFinderTrainer" lang="en" data="ner_flashman.txt" model="en-test-ner-person.bin">
+ <extra-args>
+ <arg value="-params"/>
+ <arg value="ner_TrainerParams.txt"/>
+ </extra-args>
+ </trainModel>
+ <copy file="${test.model.dir}/en-test-ner-person.bin" todir="${analysis-extras.conf.dir}"/>
+
+ <!-- https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.lemmatizer.training -->
+ <trainModel command="LemmatizerTrainerME" lang="en" data="lemmas.txt" model="en-test-lemmatizer.bin"/>
+ </target>
+
+ <macrodef name="trainModel">
+ <attribute name="command"/>
+ <attribute name="lang"/>
+ <attribute name="data"/>
+ <attribute name="model"/>
+ <element name="extra-args" optional="true"/>
+ <sequential>
+ <java classname="opennlp.tools.cmdline.CLI"
+ dir="${test.model.data.dir}"
+ fork="true"
+ failonerror="true">
+ <classpath>
+ <path refid="opennlpjars"/>
+ </classpath>
+
+ <arg value="@{command}"/>
+
+ <arg value="-lang"/>
+ <arg value="@{lang}"/>
+
+ <arg value="-data"/>
+ <arg value="@{data}"/>
+
+ <arg value="-model"/>
+ <arg value="${test.model.dir}/@{model}"/>
+
+ <extra-args/>
+ </java>
+ </sequential>
+ </macrodef>
+
+ <target name="regenerate" depends="train-test-models"/>
+</project>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/ivy.xml
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/ivy.xml b/lucene/analysis/opennlp/ivy.xml
new file mode 100644
index 0000000..c7b885f
--- /dev/null
+++ b/lucene/analysis/opennlp/ivy.xml
@@ -0,0 +1,29 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<ivy-module version="2.0">
+ <info organisation="org.apache.lucene" module="analyzers-opennlp" />
+ <configurations defaultconfmapping="compile->master">
+ <conf name="compile" transitive="false"/>
+ </configurations>
+ <dependencies>
+ <dependency org="org.apache.opennlp" name="opennlp-tools" rev="${/org.apache.opennlp/opennlp-tools}" transitive="false" conf="compile" />
+ <dependency org="org.apache.opennlp" name="opennlp-maxent" rev="${/org.apache.opennlp/opennlp-maxent}" transitive="false" conf="compile" />
+ <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}" />
+ </dependencies>
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
new file mode 100644
index 0000000..cfc47e6
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilter.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Run OpenNLP chunker. Prerequisite: the OpenNLPTokenizer and OpenNLPPOSFilter must precede this filter.
+ * Tags terms in the TypeAttribute, replacing the POS tags previously put there by OpenNLPPOSFilter.
+ * <p>
+ * Operates one sentence at a time: tokens are buffered (with their term text and POS-tag type)
+ * until a token whose FlagsAttribute carries {@code OpenNLPTokenizer.EOS_FLAG_BIT} is read,
+ * the whole sentence is chunked in a single call, and the buffered tokens are then replayed
+ * with chunk tags written into the TypeAttribute.
+ */
+public final class OpenNLPChunkerFilter extends TokenFilter {
+
+ // Attribute snapshots for every token in the currently buffered sentence
+ private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+ // Index of the next buffered token to emit
+ private int tokenNum = 0;
+ // False once the upstream stream is exhausted
+ private boolean moreTokensAvailable = true;
+ // Term texts of the buffered sentence; null when no sentence is buffered
+ private String[] sentenceTerms = null;
+ // POS tags (from the TypeAttribute) parallel to sentenceTerms
+ private String[] sentenceTermPOSTags = null;
+
+ private final NLPChunkerOp chunkerOp;
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ public OpenNLPChunkerFilter(TokenStream input, NLPChunkerOp chunkerOp) {
+ super(input);
+ this.chunkerOp = chunkerOp;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if ( ! moreTokensAvailable) {
+ clear();
+ return false;
+ }
+ // Buffer exhausted: pull in and chunk the next sentence
+ if (tokenNum == sentenceTokenAttrs.size()) {
+ nextSentence();
+ if (sentenceTerms == null) {
+ clear();
+ return false;
+ }
+ // Overwrite each buffered token's type (currently a POS tag) with its chunk tag
+ assignTokenTypes(chunkerOp.getChunks(sentenceTerms, sentenceTermPOSTags, null));
+ tokenNum = 0;
+ }
+ // Replay the next buffered token through this stream's attributes
+ clearAttributes();
+ sentenceTokenAttrs.get(tokenNum++).copyTo(this);
+ return true;
+ }
+
+ // Reads tokens up to and including the next end-of-sentence token, recording
+ // term text, POS tag, and a full attribute snapshot for each.
+ private void nextSentence() throws IOException {
+ List<String> termList = new ArrayList<>();
+ List<String> posTagList = new ArrayList<>();
+ sentenceTokenAttrs.clear();
+ boolean endOfSentence = false;
+ while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+ termList.add(termAtt.toString());
+ posTagList.add(typeAtt.type());
+ endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+ sentenceTokenAttrs.add(input.cloneAttributes());
+ }
+ // null (not empty arrays) signals "no more sentences" to incrementToken()
+ sentenceTerms = termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
+ sentenceTermPOSTags = posTagList.size() > 0 ? posTagList.toArray(new String[posTagList.size()]) : null;
+ }
+
+ // Writes the chunk tag for each buffered token into its snapshotted TypeAttribute.
+ private void assignTokenTypes(String[] tags) {
+ for (int i = 0 ; i < tags.length ; ++i) {
+ sentenceTokenAttrs.get(i).getAttribute(TypeAttribute.class).setType(tags[i]);
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ moreTokensAvailable = true;
+ clear();
+ }
+
+ // Drops all per-sentence state so the filter starts fresh.
+ private void clear() {
+ sentenceTokenAttrs.clear();
+ sentenceTerms = null;
+ sentenceTermPOSTags = null;
+ tokenNum = 0;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilterFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilterFactory.java
new file mode 100644
index 0000000..96eb672
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPChunkerFilterFactory.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPChunkerOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link OpenNLPChunkerFilter}.
+ *
+ * <pre class="prettyprint">
+ * <fieldType name="text_opennlp_chunked" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/>
+ *     <filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="filename"/>
+ *     <filter class="solr.OpenNLPChunkerFilterFactory" chunkerModel="filename"/>
+ *   </analyzer>
+ * </fieldType></pre>
+ * @since 7.3.0
+ */
+public class OpenNLPChunkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  public static final String CHUNKER_MODEL = "chunkerModel";
+
+  /** Name of the chunker model file/resource; null when not configured. */
+  private final String chunkerModelFile;
+
+  /** Creates a new OpenNLPChunkerFilterFactory from the given configuration arguments. */
+  public OpenNLPChunkerFilterFactory(Map<String,String> args) {
+    super(args);
+    chunkerModelFile = get(args, CHUNKER_MODEL);
+    if (args.isEmpty() == false) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public OpenNLPChunkerFilter create(TokenStream in) {
+    NLPChunkerOp chunkerOp = null;
+    try {
+      if (null != chunkerModelFile) {
+        // The model was registered in the cache by inform(); this just wraps it in an op
+        chunkerOp = OpenNLPOpsFactory.getChunker(chunkerModelFile);
+      }
+    } catch (IOException e) {
+      throw new IllegalArgumentException(e);
+    }
+    return new OpenNLPChunkerFilter(in, chunkerOp);
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) {
+    if (chunkerModelFile == null) {
+      return;
+    }
+    try {
+      // Load and register the read-only model in the cache under its file/resource name
+      OpenNLPOpsFactory.getChunkerModel(chunkerModelFile, loader);
+    } catch (IOException e) {
+      throw new IllegalArgumentException(e);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
new file mode 100644
index 0000000..4c484b9
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilter.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * <p>Runs OpenNLP dictionary-based and/or MaxEnt lemmatizers.</p>
+ * <p>
+ * Both a dictionary-based lemmatizer and a MaxEnt lemmatizer are supported,
+ * via the "dictionary" and "lemmatizerModel" params, respectively.
+ * If both are configured, the dictionary-based lemmatizer is tried first,
+ * and then the MaxEnt lemmatizer is consulted for out-of-vocabulary tokens.
+ * </p>
+ * <p>
+ * The dictionary file must be encoded as UTF-8, with one entry per line,
+ * in the form <tt>word[tab]lemma[tab]part-of-speech</tt>
+ * </p>
+ * <p>
+ * Buffers one sentence at a time (bounded by a token carrying
+ * {@code OpenNLPTokenizer.EOS_FLAG_BIT} in its FlagsAttribute), lemmatizes the
+ * sentence's non-keyword tokens in a single call, then replays the buffered
+ * tokens with their term text replaced by the lemma. Tokens marked with the
+ * KeywordAttribute pass through unchanged.
+ * </p>
+ */
+public class OpenNLPLemmatizerFilter extends TokenFilter {
+ private final NLPLemmatizerOp lemmatizerOp;
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+ private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+ // Attribute snapshots for every token (keyword or not) in the buffered sentence
+ private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+ private Iterator<AttributeSource> sentenceTokenAttrsIter = null;
+ // False once the upstream stream is exhausted
+ private boolean moreTokensAvailable = true;
+ private String[] sentenceTokens = null; // non-keyword tokens
+ private String[] sentenceTokenTypes = null; // types for non-keyword tokens
+ private String[] lemmas = null; // lemmas for non-keyword tokens
+ private int lemmaNum = 0; // lemma counter
+
+ public OpenNLPLemmatizerFilter(TokenStream input, NLPLemmatizerOp lemmatizerOp) {
+ super(input);
+ this.lemmatizerOp = lemmatizerOp;
+ }
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ if ( ! moreTokensAvailable) {
+ clear();
+ return false;
+ }
+ // Buffer exhausted: pull in and lemmatize the next sentence
+ if (sentenceTokenAttrsIter == null || ! sentenceTokenAttrsIter.hasNext()) {
+ nextSentence();
+ if (sentenceTokens == null) { // zero non-keyword tokens
+ clear();
+ return false;
+ }
+ lemmas = lemmatizerOp.lemmatize(sentenceTokens, sentenceTokenTypes);
+ lemmaNum = 0;
+ sentenceTokenAttrsIter = sentenceTokenAttrs.iterator();
+ }
+ // Replay the next buffered token; keyword tokens keep their original term text,
+ // so the lemma counter only advances for non-keyword tokens
+ clearAttributes();
+ sentenceTokenAttrsIter.next().copyTo(this);
+ if ( ! keywordAtt.isKeyword()) {
+ termAtt.setEmpty().append(lemmas[lemmaNum++]);
+ }
+ return true;
+
+ }
+
+ // Reads tokens up to and including the next end-of-sentence token. All tokens are
+ // snapshotted for replay, but only non-keyword tokens are collected for lemmatization.
+ private void nextSentence() throws IOException {
+ List<String> tokenList = new ArrayList<>();
+ List<String> typeList = new ArrayList<>();
+ sentenceTokenAttrs.clear();
+ boolean endOfSentence = false;
+ while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+ if ( ! keywordAtt.isKeyword()) {
+ tokenList.add(termAtt.toString());
+ typeList.add(typeAtt.type());
+ }
+ endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+ sentenceTokenAttrs.add(input.cloneAttributes());
+ }
+ // null (not empty arrays) signals "nothing left to lemmatize" to incrementToken()
+ sentenceTokens = tokenList.size() > 0 ? tokenList.toArray(new String[tokenList.size()]) : null;
+ sentenceTokenTypes = typeList.size() > 0 ? typeList.toArray(new String[typeList.size()]) : null;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ moreTokensAvailable = true;
+ clear();
+ }
+
+ // Drops all per-sentence state so the filter starts fresh.
+ private void clear() {
+ sentenceTokenAttrs.clear();
+ sentenceTokenAttrsIter = null;
+ sentenceTokens = null;
+ sentenceTokenTypes = null;
+ lemmas = null;
+ lemmaNum = 0;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilterFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilterFactory.java
new file mode 100644
index 0000000..90a0e43
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPLemmatizerFilterFactory.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPLemmatizerOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link OpenNLPLemmatizerFilter}.
+ *
+ * <pre class="prettyprint">
+ * <fieldType name="text_opennlp_lemma" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.OpenNLPTokenizerFactory"
+ *                sentenceModel="filename"
+ *                tokenizerModel="filename"/>
+ *     <filter class="solr.OpenNLPLemmatizerFilterFactory"
+ *             dictionary="filename"
+ *             lemmatizerModel="filename"/>
+ *   </analyzer>
+ * </fieldType></pre>
+ * @since 7.3.0
+ */
+public class OpenNLPLemmatizerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  public static final String DICTIONARY = "dictionary";
+  public static final String LEMMATIZER_MODEL = "lemmatizerModel";
+
+  /** Name of the lemma dictionary file/resource; null when not configured. */
+  private final String dictionaryFile;
+  /** Name of the MaxEnt lemmatizer model file/resource; null when not configured. */
+  private final String lemmatizerModelFile;
+
+  /**
+   * Creates a new OpenNLPLemmatizerFilterFactory. At least one of the
+   * {@code dictionary} and {@code lemmatizerModel} parameters is required.
+   */
+  public OpenNLPLemmatizerFilterFactory(Map<String,String> args) {
+    super(args);
+    dictionaryFile = get(args, DICTIONARY);
+    lemmatizerModelFile = get(args, LEMMATIZER_MODEL);
+
+    if (dictionaryFile == null && lemmatizerModelFile == null) {
+      throw new IllegalArgumentException("Configuration Error: missing parameter: at least one of '"
+          + DICTIONARY + "' and '" + LEMMATIZER_MODEL + "' must be provided.");
+    }
+
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public OpenNLPLemmatizerFilter create(TokenStream in) {
+    try {
+      NLPLemmatizerOp lemmatizerOp = OpenNLPOpsFactory.getLemmatizer(dictionaryFile, lemmatizerModelFile);
+      return new OpenNLPLemmatizerFilter(in, lemmatizerOp);
+    } catch (IOException e) {
+      // IllegalArgumentException for consistency with the other OpenNLP factories
+      // (backward compatible: IAE is a RuntimeException)
+      throw new IllegalArgumentException(e);
+    }
+  }
+
+  @Override
+  public void inform(ResourceLoader loader) throws IOException {
+    // register models in cache with file/resource names
+    if (dictionaryFile != null) {
+      OpenNLPOpsFactory.getLemmatizerDictionary(dictionaryFile, loader);
+    }
+    if (lemmatizerModelFile != null) {
+      OpenNLPOpsFactory.getLemmatizerModel(lemmatizerModelFile, loader);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
new file mode 100644
index 0000000..a5bea28
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilter.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.NLPPOSTaggerOp;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Run OpenNLP POS tagger. Tags all terms in the TypeAttribute.
+ * <p>
+ * Operates one sentence at a time: tokens are buffered until a token whose
+ * FlagsAttribute carries {@code OpenNLPTokenizer.EOS_FLAG_BIT} is read, the
+ * sentence is tagged in a single call, and the buffered tokens are then
+ * replayed with their POS tags written into the TypeAttribute.
+ */
+public final class OpenNLPPOSFilter extends TokenFilter {
+
+  // Attribute snapshots for every token in the currently buffered sentence
+  private List<AttributeSource> sentenceTokenAttrs = new ArrayList<>();
+  // POS tags parallel to sentenceTokenAttrs; package-private — NOTE(review): confirm whether tests rely on this visibility
+  String[] tags = null;
+  // Index of the next buffered token to emit
+  private int tokenNum = 0;
+  // False once the upstream stream is exhausted
+  private boolean moreTokensAvailable = true;
+
+  private final NLPPOSTaggerOp posTaggerOp;
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  public OpenNLPPOSFilter(TokenStream input, NLPPOSTaggerOp posTaggerOp) {
+    super(input);
+    this.posTaggerOp = posTaggerOp;
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if ( ! moreTokensAvailable) {
+      clear();
+      return false;
+    }
+    if (tokenNum == sentenceTokenAttrs.size()) { // beginning of stream, or previous sentence exhausted
+      String[] sentenceTokens = nextSentence();
+      if (sentenceTokens == null) {
+        clear();
+        return false;
+      }
+      tags = posTaggerOp.getPOSTags(sentenceTokens);
+      tokenNum = 0;
+    }
+    // Replay the next buffered token, overwriting its type with the POS tag
+    clearAttributes();
+    sentenceTokenAttrs.get(tokenNum).copyTo(this);
+    typeAtt.setType(tags[tokenNum++]);
+    return true;
+  }
+
+  // Reads tokens up to and including the next end-of-sentence token, recording term
+  // text and an attribute snapshot for each. Returns null when the stream is exhausted.
+  private String[] nextSentence() throws IOException {
+    List<String> termList = new ArrayList<>();
+    sentenceTokenAttrs.clear();
+    boolean endOfSentence = false;
+    while ( ! endOfSentence && (moreTokensAvailable = input.incrementToken())) {
+      termList.add(termAtt.toString());
+      endOfSentence = 0 != (flagsAtt.getFlags() & OpenNLPTokenizer.EOS_FLAG_BIT);
+      sentenceTokenAttrs.add(input.cloneAttributes());
+    }
+    return termList.size() > 0 ? termList.toArray(new String[termList.size()]) : null;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    moreTokensAvailable = true;
+    // Drop any partially-consumed sentence so a reused stream starts from a clean
+    // state, consistent with OpenNLPChunkerFilter.reset() and
+    // OpenNLPLemmatizerFilter.reset(); without this, stale sentenceTokenAttrs/tags
+    // could be replayed after reset.
+    clear();
+  }
+
+  // Drops all per-sentence state so the filter starts fresh.
+  private void clear() {
+    sentenceTokenAttrs.clear();
+    tags = null;
+    tokenNum = 0;
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java
new file mode 100644
index 0000000..952218f
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPPOSFilterFactory.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link OpenNLPPOSFilter}.
+ *
+ * <pre class="prettyprint">
+ * <fieldType name="text_opennlp_pos" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/>
+ * <filter class="solr.OpenNLPPOSFilterFactory" posTaggerModel="filename"/>
+ * </analyzer>
+ * </fieldType></pre>
+ * @since 7.3.0
+ */
+public class OpenNLPPOSFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+ public static final String POS_TAGGER_MODEL = "posTaggerModel";
+
+ private final String posTaggerModelFile;
+
+ /**
+  * Creates the factory. The {@code posTaggerModel} argument is required;
+  * any remaining arguments are rejected.
+  */
+ public OpenNLPPOSFilterFactory(Map<String,String> args) {
+ super(args);
+ posTaggerModelFile = require(args, POS_TAGGER_MODEL);
+ if ( ! args.isEmpty()) { // spacing style consistent with the other OpenNLP factories
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public OpenNLPPOSFilter create(TokenStream in) {
+ try {
+ // The model was registered in the cache by inform(); getPOSTagger retrieves it by name.
+ return new OpenNLPPOSFilter(in, OpenNLPOpsFactory.getPOSTagger(posTaggerModelFile));
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+ @Override
+ public void inform(ResourceLoader loader) {
+ try { // load and register the read-only model in cache with file/resource name
+ OpenNLPOpsFactory.getPOSTaggerModel(posTaggerModelFile, loader);
+ } catch (IOException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java
new file mode 100644
index 0000000..f69fbc6
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPSentenceBreakIterator.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+import opennlp.tools.util.Span;
+import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
+import org.apache.lucene.analysis.util.CharArrayIterator;
+
+/**
+ * A {@link BreakIterator} that splits sentences using an OpenNLP sentence chunking model.
+ */
+public final class OpenNLPSentenceBreakIterator extends BreakIterator {
+
+ private CharacterIterator text;
+ // Index into sentenceStarts of the sentence the iterator currently points at.
+ private int currentSentence;
+ // Begin offset of each detected sentence, adjusted to the CharacterIterator's begin index.
+ private int[] sentenceStarts;
+ private NLPSentenceDetectorOp sentenceOp;
+
+ public OpenNLPSentenceBreakIterator(NLPSentenceDetectorOp sentenceOp) {
+ this.sentenceOp = sentenceOp;
+ }
+
+ @Override
+ public int current() {
+ return text.getIndex();
+ }
+
+ @Override
+ public int first() {
+ currentSentence = 0;
+ text.setIndex(text.getBeginIndex());
+ return current();
+ }
+
+ @Override
+ public int last() {
+ if (sentenceStarts.length > 0) {
+ currentSentence = sentenceStarts.length - 1;
+ text.setIndex(text.getEndIndex());
+ } else { // there are no sentences; both the first and last positions are the begin index
+ currentSentence = 0;
+ text.setIndex(text.getBeginIndex());
+ }
+ return current();
+ }
+
+ @Override
+ public int next() {
+ if (text.getIndex() == text.getEndIndex() || 0 == sentenceStarts.length) {
+ return DONE;
+ } else if (currentSentence < sentenceStarts.length - 1) {
+ text.setIndex(sentenceStarts[++currentSentence]);
+ return current();
+ } else {
+ // On the last sentence: advance to the end index rather than returning DONE.
+ return last();
+ }
+ }
+
+ @Override
+ public int following(int pos) {
+ if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
+ throw new IllegalArgumentException("offset out of bounds");
+ } else if (0 == sentenceStarts.length) {
+ text.setIndex(text.getBeginIndex());
+ return DONE;
+ } else if (pos >= sentenceStarts[sentenceStarts.length - 1]) {
+ // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
+ // https://bugs.openjdk.java.net/browse/JDK-8015110
+ text.setIndex(text.getEndIndex());
+ currentSentence = sentenceStarts.length - 1;
+ return DONE;
+ } else { // there are at least two sentences
+ currentSentence = (sentenceStarts.length - 1) / 2; // start search from the middle
+ // Binary-search to the sentence containing pos, then step to the next sentence start.
+ moveToSentenceAt(pos, 0, sentenceStarts.length - 2);
+ text.setIndex(sentenceStarts[++currentSentence]);
+ return current();
+ }
+ }
+
+ /** Binary search over sentences: on return, currentSentence is the sentence containing pos.
+  *  Callers must seed currentSentence inside [minSentence, maxSentence] before calling. */
+ private void moveToSentenceAt(int pos, int minSentence, int maxSentence) {
+ if (minSentence != maxSentence) {
+ if (pos < sentenceStarts[currentSentence]) {
+ // pos lies in an earlier sentence; recurse into the lower half.
+ int newMaxSentence = currentSentence - 1;
+ currentSentence = minSentence + (currentSentence - minSentence) / 2;
+ moveToSentenceAt(pos, minSentence, newMaxSentence);
+ } else if (pos >= sentenceStarts[currentSentence + 1]) {
+ // pos lies in a later sentence; recurse into the upper half.
+ int newMinSentence = currentSentence + 1;
+ currentSentence = maxSentence - (maxSentence - currentSentence) / 2;
+ moveToSentenceAt(pos, newMinSentence, maxSentence);
+ }
+ } else {
+ assert currentSentence == minSentence;
+ assert pos >= sentenceStarts[currentSentence];
+ assert (currentSentence == sentenceStarts.length - 1 && pos <= text.getEndIndex())
+ || pos < sentenceStarts[currentSentence + 1];
+ }
+ // we have arrived - nothing to do
+ }
+
+ @Override
+ public int previous() {
+ if (text.getIndex() == text.getBeginIndex()) {
+ return DONE;
+ } else {
+ if (0 == sentenceStarts.length) {
+ text.setIndex(text.getBeginIndex());
+ return DONE;
+ }
+ if (text.getIndex() == text.getEndIndex()) {
+ // From the end position, "previous" is the start of the current (last) sentence.
+ text.setIndex(sentenceStarts[currentSentence]);
+ } else {
+ text.setIndex(sentenceStarts[--currentSentence]);
+ }
+ return current();
+ }
+ }
+
+ @Override
+ public int preceding(int pos) {
+ if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
+ throw new IllegalArgumentException("offset out of bounds");
+ } else if (0 == sentenceStarts.length) {
+ text.setIndex(text.getBeginIndex());
+ currentSentence = 0;
+ return DONE;
+ } else if (pos < sentenceStarts[0]) {
+ // this conflicts with the javadocs, but matches actual behavior (Oracle has a bug in something)
+ // https://bugs.openjdk.java.net/browse/JDK-8015110
+ text.setIndex(text.getBeginIndex());
+ currentSentence = 0;
+ return DONE;
+ } else {
+ currentSentence = sentenceStarts.length / 2; // start search from the middle
+ moveToSentenceAt(pos, 0, sentenceStarts.length - 1);
+ if (0 == currentSentence) {
+ // pos is inside the first sentence: nothing precedes it.
+ text.setIndex(text.getBeginIndex());
+ return DONE;
+ } else {
+ text.setIndex(sentenceStarts[--currentSentence]);
+ return current();
+ }
+ }
+ }
+
+ @Override
+ public int next(int n) {
+ currentSentence += n;
+ if (n < 0) {
+ if (text.getIndex() == text.getEndIndex()) {
+ // Compensate: from the end position, the first backward step stays on the last sentence.
+ ++currentSentence;
+ }
+ if (currentSentence < 0) {
+ currentSentence = 0;
+ text.setIndex(text.getBeginIndex());
+ return DONE;
+ } else {
+ text.setIndex(sentenceStarts[currentSentence]);
+ }
+ } else if (n > 0) {
+ if (currentSentence >= sentenceStarts.length) {
+ currentSentence = sentenceStarts.length - 1;
+ text.setIndex(text.getEndIndex());
+ return DONE;
+ } else {
+ text.setIndex(sentenceStarts[currentSentence]);
+ }
+ }
+ return current();
+ }
+
+ @Override
+ public CharacterIterator getText() {
+ return text;
+ }
+
+ @Override
+ public void setText(CharacterIterator newText) {
+ text = newText;
+ text.setIndex(text.getBeginIndex());
+ currentSentence = 0;
+ // Run sentence detection eagerly over the full text; only starts are retained.
+ Span[] spans = sentenceOp.splitSentences(characterIteratorToString());
+ sentenceStarts = new int[spans.length];
+ for (int i = 0; i < spans.length; ++i) {
+ // Adjust start positions to match those of the passed-in CharacterIterator
+ sentenceStarts[i] = spans[i].getStart() + text.getBeginIndex();
+ }
+ }
+
+ /** Extracts the full text from the CharacterIterator, restoring its index afterwards. */
+ private String characterIteratorToString() {
+ String fullText;
+ if (text instanceof CharArrayIterator) {
+ // Fast path: copy directly out of the backing char array.
+ CharArrayIterator charArrayIterator = (CharArrayIterator)text;
+ fullText = new String(charArrayIterator.getText(), charArrayIterator.getStart(), charArrayIterator.getLength());
+ } else {
+ // TODO: is there a better way to extract full text from arbitrary CharacterIterators?
+ StringBuilder builder = new StringBuilder();
+ for (char ch = text.first(); ch != CharacterIterator.DONE; ch = text.next()) {
+ builder.append(ch);
+ }
+ fullText = builder.toString();
+ text.setIndex(text.getBeginIndex());
+ }
+ return fullText;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
new file mode 100644
index 0000000..75a3b81
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizer.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+
+import opennlp.tools.util.Span;
+
+import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
+import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
+import org.apache.lucene.util.AttributeFactory;
+
+/**
+ * Run OpenNLP SentenceDetector and Tokenizer.
+ * The last token in each sentence is marked by setting the {@link #EOS_FLAG_BIT} in the FlagsAttribute;
+ * following filters can use this information to apply operations to tokens one sentence at a time.
+ */
+public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
+ // Flag bit set on the last token of each sentence. Declared final: it is a
+ // protocol constant read by downstream filters and must never be reassigned.
+ public static final int EOS_FLAG_BIT = 1;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ private Span[] termSpans = null; // term boundaries within the current sentence
+ private int termNum = 0; // index of the next term to emit
+ private int sentenceStart = 0; // offset of the current sentence within the buffer
+
+ private NLPSentenceDetectorOp sentenceOp = null;
+ private NLPTokenizerOp tokenizerOp = null;
+
+ /**
+  * Sole constructor.
+  *
+  * @param sentenceOp sentence detector; also drives the BreakIterator passed to the superclass
+  * @param tokenizerOp word tokenizer applied to each detected sentence
+  * @throws IllegalArgumentException if either op is null
+  */
+ public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) throws IOException {
+ super(factory, new OpenNLPSentenceBreakIterator(sentenceOp));
+ if (sentenceOp == null || tokenizerOp == null) {
+ throw new IllegalArgumentException("OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
+ }
+ this.sentenceOp = sentenceOp;
+ this.tokenizerOp = tokenizerOp;
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ termSpans = null;
+ termNum = sentenceStart = 0;
+ }
+
+ @Override
+ protected void setNextSentence(int sentenceStart, int sentenceEnd) {
+ this.sentenceStart = sentenceStart;
+ // Tokenize the sentence eagerly; spans are relative to the sentence, not the buffer.
+ String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
+ termSpans = tokenizerOp.getTerms(sentenceText);
+ termNum = 0;
+ }
+
+ @Override
+ protected boolean incrementWord() {
+ if (termSpans == null || termNum == termSpans.length) {
+ return false;
+ }
+ clearAttributes();
+ Span term = termSpans[termNum];
+ termAtt.copyBuffer(buffer, sentenceStart + term.getStart(), term.length());
+ // Offsets are sentence-relative spans shifted by the sentence's buffer offset.
+ offsetAtt.setOffset(correctOffset(offset + sentenceStart + term.getStart()),
+ correctOffset(offset + sentenceStart + term.getEnd()));
+ if (termNum == termSpans.length - 1) {
+ flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
+ }
+ ++termNum;
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ termSpans = null;
+ termNum = sentenceStart = 0;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3e2f9e62/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizerFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizerFactory.java
new file mode 100644
index 0000000..a60f23f
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/OpenNLPTokenizerFactory.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
+import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+
+/**
+ * Factory for {@link OpenNLPTokenizer}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_opennlp" class="solr.TextField" positionIncrementGap="100"&gt;
+ * <analyzer>
+ * <tokenizer class="solr.OpenNLPTokenizerFactory" sentenceModel="filename" tokenizerModel="filename"/>
+ * </analyzer>
+ * </fieldType></pre>
+ * @since 7.3.0
+ */
+public class OpenNLPTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
+ public static final String SENTENCE_MODEL = "sentenceModel";
+ public static final String TOKENIZER_MODEL = "tokenizerModel";
+
+ private final String sentenceModelFile;
+ private final String tokenizerModelFile;
+
+ /**
+  * Creates the factory. Both {@code sentenceModel} and {@code tokenizerModel}
+  * are required; any remaining arguments are rejected.
+  */
+ public OpenNLPTokenizerFactory(Map<String,String> args) {
+ super(args);
+ sentenceModelFile = require(args, SENTENCE_MODEL);
+ tokenizerModelFile = require(args, TOKENIZER_MODEL);
+ if ( ! args.isEmpty()) {
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public OpenNLPTokenizer create(AttributeFactory factory) {
+ try {
+ NLPSentenceDetectorOp sentenceOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
+ NLPTokenizerOp tokenizerOp = OpenNLPOpsFactory.getTokenizer(tokenizerModelFile);
+ return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
+ } catch (IOException e) {
+ // IllegalArgumentException for consistency with OpenNLPPOSFilterFactory.create();
+ // it is-a RuntimeException, so existing callers are unaffected.
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+ @Override
+ public void inform(ResourceLoader loader) throws IOException {
+ // Load and register the read-only models in cache with their file/resource names.
+ // Both fields are guaranteed non-null by require() in the constructor.
+ OpenNLPOpsFactory.getSentenceModel(sentenceModelFile, loader);
+ OpenNLPOpsFactory.getTokenizerModel(tokenizerModelFile, loader);
+ }
+}