You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2017/12/15 16:26:04 UTC
[11/12] lucene-solr:branch_7x: LUCENE-2899: Add OpenNLP Analysis
capabilities as a module
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/package-info.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/package-info.java
new file mode 100644
index 0000000..527e24f
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Analysis components based on OpenNLP
+ */
+package org.apache.lucene.analysis.opennlp;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPChunkerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPChunkerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPChunkerOp.java
new file mode 100644
index 0000000..f6a5ea8
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPChunkerOp.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.IOException;
+import opennlp.tools.chunker.ChunkerME;
+import opennlp.tools.chunker.ChunkerModel;
+
+/**
+ * Thin wrapper around the OpenNLP {@link ChunkerME} chunking tool.
+ * Requires binary models from OpenNLP project on SourceForge.
+ */
+public class NLPChunkerOp {
+  private final ChunkerME chunker;
+
+  /**
+   * Creates a chunker backed by the given model.
+   *
+   * @param chunkerModel the OpenNLP chunker model handed to {@link ChunkerME}
+   */
+  public NLPChunkerOp(ChunkerModel chunkerModel) throws IOException {
+    this.chunker = new ChunkerME(chunkerModel);
+  }
+
+  /**
+   * Chunks a sentence.
+   *
+   * @param words the tokens of the sentence
+   * @param tags the tags passed to the chunker alongside {@code words}
+   * @param probs if non-null, handed to {@code ChunkerME.probs(double[])} after chunking
+   * @return the chunk tags produced by the chunker
+   */
+  public synchronized String[] getChunks(String[] words, String[] tags, double[] probs) {
+    final String[] result = chunker.chunk(words, tags);
+    if (probs == null) {
+      return result;
+    }
+    // Must happen after chunk(): the call order of the original is preserved.
+    chunker.probs(probs);
+    return result;
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
new file mode 100644
index 0000000..b09c63e
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import opennlp.tools.lemmatizer.LemmatizerME;
+import opennlp.tools.lemmatizer.LemmatizerModel;
+
+/**
+ * <p>Supply OpenNLP Lemmatizer tools.</p>
+ * <p>
+ *   A dictionary-based lemmatizer and a MaxEnt lemmatizer are both supported.
+ *   When both are configured, the dictionary is consulted first and the MaxEnt
+ *   lemmatizer is only asked about tokens the dictionary does not know.
+ * </p>
+ * <p>
+ *   The MaxEnt implementation requires binary models from OpenNLP project on SourceForge.
+ * </p>
+ */
+public class NLPLemmatizerOp {
+  private final DictionaryLemmatizer dictionaryLemmatizer;
+  private final LemmatizerME lemmatizerME;
+
+  /**
+   * @param dictionary stream holding the lemma dictionary, or null for no dictionary lemmatizer
+   * @param lemmatizerModel MaxEnt lemmatizer model, or null for no MaxEnt lemmatizer
+   */
+  public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel) throws IOException {
+    assert dictionary != null || lemmatizerModel != null : "At least one parameter must be non-null";
+    if (dictionary == null) {
+      dictionaryLemmatizer = null;
+    } else {
+      dictionaryLemmatizer = new DictionaryLemmatizer(dictionary);
+    }
+    if (lemmatizerModel == null) {
+      lemmatizerME = null;
+    } else {
+      lemmatizerME = new LemmatizerME(lemmatizerModel);
+    }
+  }
+
+  /**
+   * Lemmatizes the given tokens.
+   *
+   * @param words the tokens to lemmatize
+   * @param postags the part-of-speech tags passed to the lemmatizers alongside {@code words}
+   * @return one lemma per token; a token for which no lemma is found is returned unchanged
+   */
+  public String[] lemmatize(String[] words, String[] postags) {
+    if (dictionaryLemmatizer == null) {
+      // Only a MaxEnt lemmatizer is available.
+      final String[] predicted = lemmatizerME.lemmatize(words, postags);
+      for (int idx = 0; idx < predicted.length; idx++) {
+        if ("_".equals(predicted[idx])) {
+          predicted[idx] = words[idx]; // "_" means no lemma: keep the original word
+        }
+      }
+      return predicted;
+    }
+
+    final String[] result = dictionaryLemmatizer.lemmatize(words, postags);
+    String[] fallback = null; // MaxEnt output, computed lazily on the first dictionary miss
+    for (int idx = 0; idx < result.length; idx++) {
+      if (!result[idx].equals("O")) {
+        continue; // the dictionary knows this word
+      }
+      if (lemmatizerME == null) {
+        result[idx] = words[idx]; // no MaxEnt lemmatizer: keep the original word
+        continue;
+      }
+      if (fallback == null) {
+        fallback = lemmatizerME.lemmatize(words, postags);
+      }
+      result[idx] = "_".equals(fallback[idx]) ? words[idx] : fallback[idx];
+    }
+    return result;
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPNERTaggerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPNERTaggerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPNERTaggerOp.java
new file mode 100644
index 0000000..22e617d
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPNERTaggerOp.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Supply OpenNLP Named Entity Recognition tool
+ * Requires binary models from OpenNLP project on SourceForge.
+ *
+ * Usage: from <a href="http://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.namefind.recognition.api"
+ *    >the OpenNLP documentation</a>:
+ *
+ * "The NameFinderME class is not thread safe, it must only be called from one thread.
+ * To use multiple threads multiple NameFinderME instances sharing the same model instance
+ * can be created. The input text should be segmented into documents, sentences and tokens.
+ * To perform entity detection an application calls the find method for every sentence in
+ * the document. After every document clearAdaptiveData must be called to clear the adaptive
+ * data in the feature generators. Not calling clearAdaptiveData can lead to a sharp drop
+ * in the detection rate after a few documents."
+ *
+ * <p>Both {@link #getNames(String[])} and {@link #reset()} synchronize on this instance,
+ * so a find can never overlap with the clearing of the adaptive data.
+ */
+public class NLPNERTaggerOp {
+  private final TokenNameFinder nameFinder;
+
+  /**
+   * Creates a tagger with its own {@link NameFinderME} built from the given model.
+   *
+   * @param model the OpenNLP name finder model
+   */
+  public NLPNERTaggerOp(TokenNameFinderModel model) {
+    this.nameFinder = new NameFinderME(model);
+  }
+
+  /**
+   * Finds the named entities in one sentence.
+   *
+   * @param words the tokens of the sentence
+   * @return the spans reported by the underlying name finder
+   */
+  public synchronized Span[] getNames(String[] words) {
+    // Synchronized to match reset(): the wrapped name finder is not thread safe.
+    return nameFinder.find(words);
+  }
+
+  /**
+   * Clears the name finder's adaptive data; to be called after every document.
+   */
+  public synchronized void reset() {
+    nameFinder.clearAdaptiveData();
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java
new file mode 100644
index 0000000..447e1c0
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.IOException;
+
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTagger;
+import opennlp.tools.postag.POSTaggerME;
+
+/**
+ * Thin wrapper around the OpenNLP Parts-Of-Speech tagging tool.
+ * Requires binary models from OpenNLP project on SourceForge.
+ */
+
+public class NLPPOSTaggerOp {
+  private final POSTagger tagger;
+
+  /**
+   * Creates a tagger backed by a {@link POSTaggerME} built from the given model.
+   *
+   * @param model the OpenNLP part-of-speech model
+   */
+  public NLPPOSTaggerOp(POSModel model) throws IOException {
+    this.tagger = new POSTaggerME(model);
+  }
+
+  /**
+   * Tags the tokens of one sentence.
+   *
+   * @param words the tokens to tag
+   * @return the tags returned by the underlying tagger
+   */
+  public synchronized String[] getPOSTags(String[] words) {
+    final String[] posTags = tagger.tag(words);
+    return posTags;
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPSentenceDetectorOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPSentenceDetectorOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPSentenceDetectorOp.java
new file mode 100644
index 0000000..21983ce
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPSentenceDetectorOp.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.IOException;
+
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Thin wrapper around the OpenNLP Sentence Detector tool.
+ * Requires binary models from OpenNLP project on SourceForge.
+ */
+public class NLPSentenceDetectorOp {
+  private final SentenceDetectorME sentenceSplitter;
+
+  /**
+   * Creates a detector backed by the given sentence model.
+   *
+   * @param model the OpenNLP sentence model
+   */
+  public NLPSentenceDetectorOp(SentenceModel model) throws IOException {
+    this.sentenceSplitter = new SentenceDetectorME(model);
+  }
+
+  /**
+   * Creates a detector without a model; it treats every input as a single sentence.
+   */
+  public NLPSentenceDetectorOp() {
+    this.sentenceSplitter = null;
+  }
+
+  /**
+   * Splits text into sentences.
+   *
+   * @param line the text to split
+   * @return the sentence spans, or one span covering all of {@code line} when no model is configured
+   */
+  public synchronized Span[] splitSentences(String line) {
+    if (sentenceSplitter == null) {
+      // No model: the whole input is one sentence.
+      return new Span[] { new Span(0, line.length()) };
+    }
+    return sentenceSplitter.sentPosDetect(line);
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPTokenizerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPTokenizerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPTokenizerOp.java
new file mode 100644
index 0000000..0aeb713
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPTokenizerOp.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Thin wrapper around the OpenNLP Tokenizer tool, which splits a sentence into terms.
+ * Requires binary models from OpenNLP project on SourceForge.
+ */
+public class NLPTokenizerOp {
+  private final Tokenizer tokenizer;
+
+  /**
+   * Creates a tokenizer backed by a {@link TokenizerME} built from the given model.
+   *
+   * @param model the OpenNLP tokenizer model
+   */
+  public NLPTokenizerOp(TokenizerModel model) {
+    this.tokenizer = new TokenizerME(model);
+  }
+
+  /**
+   * Creates a tokenizer without a model; it treats every sentence as a single term.
+   */
+  public NLPTokenizerOp() {
+    this.tokenizer = null;
+  }
+
+  /**
+   * Splits a sentence into terms.
+   *
+   * @param sentence the sentence to tokenize
+   * @return the term spans, or one span covering all of {@code sentence} when no model is configured
+   */
+  public synchronized Span[] getTerms(String sentence) {
+    if (tokenizer != null) {
+      return tokenizer.tokenizePos(sentence);
+    }
+    // No model: the whole sentence is one term.
+    return new Span[] { new Span(0, sentence.length()) };
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
new file mode 100644
index 0000000..5348857
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.lemmatizer.LemmatizerModel;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.TokenizerModel;
+import org.apache.lucene.analysis.util.ResourceLoader;
+
+/**
+ * Factory for the OpenNLP operation wrappers in this package (sentence detector,
+ * tokenizer, POS tagger, chunker, NER tagger and lemmatizer).
+ * Caches model file objects by resource name. Assumes model files are thread-safe.
+ *
+ * <p>The {@code get*Model} / {@link #getLemmatizerDictionary} methods load a resource
+ * and cache it; the other {@code get*} methods only look the resource up in the cache,
+ * so the resource must have been loaded beforehand.
+ */
+public class OpenNLPOpsFactory {
+  private static final Map<String,SentenceModel> sentenceModels = new ConcurrentHashMap<>();
+  private static final ConcurrentHashMap<String,TokenizerModel> tokenizerModels = new ConcurrentHashMap<>();
+  private static final ConcurrentHashMap<String,POSModel> posTaggerModels = new ConcurrentHashMap<>();
+  private static final ConcurrentHashMap<String,ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
+  private static final Map<String,TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
+  private static final Map<String,LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
+  private static final Map<String,String> lemmaDictionaries = new ConcurrentHashMap<>();
+
+  /**
+   * Returns a new sentence detector for a previously loaded model.
+   *
+   * @param modelName name of a model cached by {@link #getSentenceModel}, or null for a
+   *                  detector without a model
+   */
+  public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
+    if (modelName != null) {
+      SentenceModel model = sentenceModels.get(modelName);
+      return new NLPSentenceDetectorOp(model);
+    } else {
+      return new NLPSentenceDetectorOp();
+    }
+  }
+
+  /**
+   * Returns the cached sentence model, loading it through {@code loader} on a cache miss.
+   *
+   * @param modelName resource name of the model; also the cache key
+   * @param loader used to open the resource when it is not cached yet
+   */
+  public static SentenceModel getSentenceModel(String modelName, ResourceLoader loader) throws IOException {
+    SentenceModel model = sentenceModels.get(modelName);
+    if (model == null) {
+      // Close the resource stream once the model has been read.
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new SentenceModel(resource);
+      }
+      sentenceModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /**
+   * Returns a new tokenizer for a previously loaded model.
+   *
+   * @param modelName name of a model cached by {@link #getTokenizerModel}, or null for a
+   *                  tokenizer without a model
+   */
+  public static NLPTokenizerOp getTokenizer(String modelName) throws IOException {
+    if (modelName == null) {
+      return new NLPTokenizerOp();
+    } else {
+      TokenizerModel model = tokenizerModels.get(modelName);
+      return new NLPTokenizerOp(model);
+    }
+  }
+
+  /**
+   * Returns the cached tokenizer model, loading it through {@code loader} on a cache miss.
+   *
+   * @param modelName resource name of the model; also the cache key
+   * @param loader used to open the resource when it is not cached yet
+   */
+  public static TokenizerModel getTokenizerModel(String modelName, ResourceLoader loader) throws IOException {
+    TokenizerModel model = tokenizerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new TokenizerModel(resource);
+      }
+      tokenizerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /**
+   * Returns a new POS tagger for a previously loaded model.
+   *
+   * @param modelName name of a model cached by {@link #getPOSTaggerModel}
+   */
+  public static NLPPOSTaggerOp getPOSTagger(String modelName) throws IOException {
+    POSModel model = posTaggerModels.get(modelName);
+    return new NLPPOSTaggerOp(model);
+  }
+
+  /**
+   * Returns the cached POS model, loading it through {@code loader} on a cache miss.
+   *
+   * @param modelName resource name of the model; also the cache key
+   * @param loader used to open the resource when it is not cached yet
+   */
+  public static POSModel getPOSTaggerModel(String modelName, ResourceLoader loader) throws IOException {
+    POSModel model = posTaggerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new POSModel(resource);
+      }
+      posTaggerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /**
+   * Returns a new chunker for a previously loaded model.
+   *
+   * @param modelName name of a model cached by {@link #getChunkerModel}
+   */
+  public static NLPChunkerOp getChunker(String modelName) throws IOException {
+    ChunkerModel model = chunkerModels.get(modelName);
+    return new NLPChunkerOp(model);
+  }
+
+  /**
+   * Returns the cached chunker model, loading it through {@code loader} on a cache miss.
+   *
+   * @param modelName resource name of the model; also the cache key
+   * @param loader used to open the resource when it is not cached yet
+   */
+  public static ChunkerModel getChunkerModel(String modelName, ResourceLoader loader) throws IOException {
+    ChunkerModel model = chunkerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new ChunkerModel(resource);
+      }
+      chunkerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /**
+   * Returns a new NER tagger for a previously loaded model.
+   *
+   * @param modelName name of a model cached by {@link #getNERTaggerModel}
+   */
+  public static NLPNERTaggerOp getNERTagger(String modelName) throws IOException {
+    TokenNameFinderModel model = nerModels.get(modelName);
+    return new NLPNERTaggerOp(model);
+  }
+
+  /**
+   * Returns the cached name finder model, loading it through {@code loader} on a cache miss.
+   *
+   * @param modelName resource name of the model; also the cache key
+   * @param loader used to open the resource when it is not cached yet
+   */
+  public static TokenNameFinderModel getNERTaggerModel(String modelName, ResourceLoader loader) throws IOException {
+    TokenNameFinderModel model = nerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new TokenNameFinderModel(resource);
+      }
+      nerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /**
+   * Returns a new lemmatizer built from a previously loaded dictionary and/or model.
+   *
+   * @param dictionaryFile name of a dictionary cached by {@link #getLemmatizerDictionary}, or null
+   * @param lemmatizerModelFile name of a model cached by {@link #getLemmatizerModel}, or null
+   */
+  public static NLPLemmatizerOp getLemmatizer(String dictionaryFile, String lemmatizerModelFile) throws IOException {
+    assert dictionaryFile != null || lemmatizerModelFile != null : "At least one parameter must be non-null";
+    InputStream dictionaryInputStream = null;
+    if (dictionaryFile != null) {
+      // The dictionary is cached as text; each lemmatizer gets its own in-memory stream over it.
+      String dictionary = lemmaDictionaries.get(dictionaryFile);
+      dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
+    }
+    LemmatizerModel lemmatizerModel = lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
+    return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
+  }
+
+  /**
+   * Returns the cached lemma dictionary text, reading it as UTF-8 through {@code loader}
+   * on a cache miss.
+   *
+   * @param dictionaryFile resource name of the dictionary; also the cache key
+   * @param loader used to open the resource when it is not cached yet
+   */
+  public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader) throws IOException {
+    String dictionary = lemmaDictionaries.get(dictionaryFile);
+    if (dictionary == null) {
+      StringBuilder builder = new StringBuilder();
+      // Closing the reader also closes the underlying resource stream.
+      try (Reader reader = new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
+        char[] chars = new char[8092];
+        int numRead;
+        while ((numRead = reader.read(chars, 0, chars.length)) > 0) {
+          builder.append(chars, 0, numRead);
+        }
+      }
+      dictionary = builder.toString();
+      lemmaDictionaries.put(dictionaryFile, dictionary);
+    }
+    return dictionary;
+  }
+
+  /**
+   * Returns the cached lemmatizer model, loading it through {@code loader} on a cache miss.
+   *
+   * @param modelName resource name of the model; also the cache key
+   * @param loader used to open the resource when it is not cached yet
+   */
+  public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader) throws IOException {
+    LemmatizerModel model = lemmatizerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new LemmatizerModel(resource);
+      }
+      lemmatizerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  // keeps unit test from blowing out memory
+  public static void clearModels() {
+    sentenceModels.clear();
+    tokenizerModels.clear();
+    posTaggerModels.clear();
+    chunkerModels.clear();
+    nerModels.clear();
+    lemmatizerModels.clear();
+    lemmaDictionaries.clear();
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/package-info.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/package-info.java
new file mode 100644
index 0000000..523a084
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tools to supply access to OpenNLP components.
+ */
+package org.apache.lucene.analysis.opennlp.tools;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/overview.html b/lucene/analysis/opennlp/src/java/overview.html
new file mode 100644
index 0000000..bf70e95
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/overview.html
@@ -0,0 +1,61 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+ <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+ <title>
+ Apache Lucene OpenNLP integration module
+ </title>
+</head>
+<body>
+<p>
+ This module exposes functionality from
+ <a href="http://opennlp.apache.org">Apache OpenNLP</a> to Apache Lucene.
+ The Apache OpenNLP library is a machine learning based toolkit for the processing of natural language text.
+<p>
+ For an introduction to Lucene's analysis API, see the {@link org.apache.lucene.analysis} package documentation.
+<p>
+ The OpenNLP Tokenizer behavior is similar to the WhitespaceTokenizer but is smart about
+ inter-word punctuation. The term stream looks very much like the way you parse words and
+ punctuation while reading. The major difference between this tokenizer and most other
+ tokenizers shipped with Lucene is that punctuation is tokenized. This is required for
+ the following taggers to operate properly.
+<p>
+ The OpenNLP taggers annotate terms using the <code>TypeAttribute</code>.
+<ul>
+ <li><code>OpenNLPTokenizer</code> segments text into sentences or words. This Tokenizer
+ uses the OpenNLP Sentence Detector and/or Tokenizer classes. When used together, the
+ Tokenizer receives sentences and can do a better job.</li>
+ <li><code>OpenNLPFilter</code> tags words using one or more technologies: Part-of-Speech,
+ Chunking, and Named Entity Recognition. These tags are assigned as token types. Note that
+ only one of these operations can tag a given token, because each token carries a single type.
+ </li>
+</ul>
+<p>
+ Since the <code>TypeAttribute</code> is not stored in the index, it is recommended that one
+ of these filters is used following <code>OpenNLPFilter</code> to enable search against the
+ assigned tags:
+<ul>
+ <li><code>TypeAsPayloadFilter</code> copies the <code>TypeAttribute</code> value to the
+ <code>PayloadAttribute</code></li>
+ <li><code>TypeAsSynonymFilter</code> creates a cloned token at the same position as each
+ tagged token, and copies the <code>TypeAttribute</code> value to the <code>CharTermAttribute</code>, optionally
+ with a customized prefix (so that tags effectively occupy a different namespace from token
+ text).</li>
+</ul>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
new file mode 100644
index 0000000..61a685d
--- /dev/null
+++ b/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.lucene.analysis.opennlp.OpenNLPChunkerFilterFactory
+org.apache.lucene.analysis.opennlp.OpenNLPLemmatizerFilterFactory
+org.apache.lucene.analysis.opennlp.OpenNLPPOSFilterFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory b/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
new file mode 100644
index 0000000..076b308
--- /dev/null
+++ b/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.lucene.analysis.opennlp.OpenNLPTokenizerFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin
new file mode 100644
index 0000000..8151914
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict
new file mode 100644
index 0000000..d1d486c
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict
@@ -0,0 +1,12 @@
+they NNP they
+sent VBD send
+him PRP he
+running VBG run
+in IN in
+the DT the
+evening NN evening
+he PRP he
+did VBD do
+not RB not
+come VB come
+back RB back
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin
new file mode 100644
index 0000000..e62df7e
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner-person.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner-person.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner-person.bin
new file mode 100644
index 0000000..0b40aac
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner-person.bin differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin
new file mode 100644
index 0000000..b77fb46
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin
new file mode 100644
index 0000000..4252bcb
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin
new file mode 100644
index 0000000..94668c0
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin differ
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
new file mode 100644
index 0000000..013348c
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+
+/**
+ * Needs the OpenNLP Tokenizer because it creates full streams of punctuation.
+ * Needs the OpenNLP POS tagger for the POS tags.
+ *
+ * Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
+ */
+public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
+
+  // Model files, resolved through the ClasspathResourceLoader created in chunkerPipeline().
+  private static final String sentenceModelFile = "en-test-sent.bin";
+  private static final String tokenizerModelFile = "en-test-tokenizer.bin";
+  private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
+  private static final String chunkerModelFile = "en-test-chunker.bin";
+
+  // Input text, followed by the expected terms, offsets and chunk tags (one entry per token).
+  private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
+  private static final String[] SENTENCES_punc
+      = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
+  private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
+  private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
+  private static final String[] SENTENCES_chunks
+      = { "B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP", "I-NP", "O" };
+
+  /** Encodes one string as UTF-8, passing {@code null} through unchanged. */
+  private static byte[] utf8OrNull(String s) {
+    if (s == null) {
+      return null;
+    }
+    return s.getBytes(StandardCharsets.UTF_8);
+  }
+
+  /** Converts every string to its UTF-8 bytes; {@code null} entries stay {@code null}. */
+  private static byte[][] toPayloads(String... strings) {
+    return Arrays.stream(strings)
+        .map(TestOpenNLPChunkerFilterFactory::utf8OrNull)
+        .toArray(byte[][]::new);
+  }
+
+  /**
+   * Starts the chain shared by both tests: OpenNLP tokenizer, then POS tagger, then chunker.
+   * The builder is returned unfinished so a test can append further filters before building.
+   */
+  private CustomAnalyzer.Builder chunkerPipeline() throws Exception {
+    return CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+        .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+        .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
+        .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile);
+  }
+
+  /** Checks terms and offsets, with the chunk tags expected in the token-type position. */
+  public void testBasic() throws Exception {
+    CustomAnalyzer analyzer = chunkerPipeline().build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
+        SENTENCES_chunks, null, null, true);
+  }
+
+  /** Same chain plus TypeAsPayloadTokenFilterFactory; the chunk tags are then expected as payloads. */
+  public void testPayloads() throws Exception {
+    CustomAnalyzer analyzer = chunkerPipeline()
+        .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
+        null, null, null, true, toPayloads(SENTENCES_chunks));
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
new file mode 100644
index 0000000..0491b91
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
+import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+
+public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase {
+
+  // One sentence: expected lemmas for the dictionary and the MaxEnt lemmatizer, plus expected types.
+  private static final String SENTENCE = "They sent him running in the evening.";
+  private static final String[] SENTENCE_dict_punc = {"they", "send", "he", "run", "in", "the", "evening", "."};
+  private static final String[] SENTENCE_maxent_punc = {"they", "send", "he", "runn", "in", "the", "evening", "."};
+  private static final String[] SENTENCE_posTags = {"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", "."};
+
+  // Two sentences, same layout as above.
+  private static final String SENTENCES = "They sent him running in the evening. He did not come back.";
+  private static final String[] SENTENCES_dict_punc
+      = {"they", "send", "he", "run", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."};
+  private static final String[] SENTENCES_maxent_punc
+      = {"they", "send", "he", "runn", "in", "the", "evening", ".", "he", "do", "not", "come", "back", "."};
+  private static final String[] SENTENCES_posTags
+      = {"NNP", "VBD", "PRP", "VBG", "IN", "DT", "NN", ".", "PRP", "VBD", "RB", "VB", "RB", "."};
+
+  // Inputs for the tests that configure both the dictionary and the MaxEnt model.
+  private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
+  private static final String[] SENTENCE_both_punc
+      = {"konstantin", "kalashnitsov", "constantly", "caliph", "."};
+  private static final String[] SENTENCE_both_posTags
+      = {"IN", "JJ", "NN", "VBN", "."};
+
+  private static final String SENTENCES_both = "Konstantin Kalashnitsov constantly caliphed. Coreena could care, completely.";
+  private static final String[] SENTENCES_both_punc
+      = {"konstantin", "kalashnitsov", "constantly", "caliph", ".", "coreena", "could", "care", ",", "completely", "."};
+  private static final String[] SENTENCES_both_posTags
+      = {"IN", "JJ", "NN", "VBN", ".", "NNP", "VBN", "NN", ",", "NN", "."};
+
+  // Expectations for the KeywordRepeat + RemoveDuplicates chains: original token, then its lemma
+  // when the lemma differs from the original.
+  private static final String[] SENTENCES_dict_keep_orig_punc
+      = {"They", "they", "sent", "send", "him", "he", "running", "run", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "."};
+  private static final String[] SENTENCES_max_ent_keep_orig_punc
+      = {"They", "they", "sent", "send", "him", "he", "running", "runn", "in", "the", "evening", ".", "He", "he", "did", "do", "not", "come", "back", "."};
+  private static final String[] SENTENCES_keep_orig_posTags
+      = {"NNP", "NNP", "VBD", "VBD", "PRP", "PRP", "VBG", "VBG", "IN", "DT", "NN", ".", "PRP", "PRP", "VBD", "VBD", "RB", "VB", "RB", "."};
+
+  private static final String[] SENTENCES_both_keep_orig_punc
+      = {"Konstantin", "konstantin", "Kalashnitsov", "kalashnitsov", "constantly", "caliphed", "caliph", ".", "Coreena", "coreena", "could", "care", ",", "completely", "."};
+  private static final String[] SENTENCES_both_keep_orig_posTags
+      = {"IN", "IN", "JJ", "JJ", "NN", "VBN", "VBN", ".", "NNP", "NNP", "VBN", "NN", ",", "NN", "."};
+
+  // Model and dictionary files, resolved through the ClasspathResourceLoader in posTaggedPipeline().
+  private static final String tokenizerModelFile = "en-test-tokenizer.bin";
+  private static final String sentenceModelFile = "en-test-sent.bin";
+  private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
+  private static final String lemmatizerModelFile = "en-test-lemmatizer.bin";
+  private static final String lemmatizerDictFile = "en-test-lemmas.dict";
+
+  /**
+   * Starts the chain every test shares: OpenNLP tokenizer followed by the POS tagger.
+   * The builder is returned unfinished so each test appends its own lemmatizer configuration.
+   */
+  private CustomAnalyzer.Builder posTaggedPipeline() throws Exception {
+    return CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+        .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+        .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile);
+  }
+
+  /** Dictionary-only lemmatization of a single sentence. */
+  public void test1SentenceDictionaryOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
+        SENTENCE_posTags, null, null, true);
+  }
+
+  /** Dictionary-only lemmatization across two sentences. */
+  public void test2SentencesDictionaryOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_punc, null, null,
+        SENTENCES_posTags, null, null, true);
+  }
+
+  /** MaxEnt-model-only lemmatization of a single sentence. */
+  public void test1SentenceMaxEntOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_maxent_punc, null, null,
+        SENTENCE_posTags, null, null, true);
+  }
+
+  /** MaxEnt-model-only lemmatization across two sentences. */
+  public void test2SentencesMaxEntOnly() throws Exception {
+    // NOTE(review): the mixed-case factory name is kept exactly as in the original test;
+    // presumably it exercises case-insensitive factory lookup - confirm before normalizing.
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_maxent_punc, null, null,
+        SENTENCES_posTags, null, null, true);
+  }
+
+  /** Dictionary and MaxEnt model configured together, single sentence. */
+  public void test1SentenceDictionaryAndMaxEnt() throws Exception {
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCE_both, SENTENCE_both_punc, null, null,
+        SENTENCE_both_posTags, null, null, true);
+  }
+
+  /** Dictionary and MaxEnt model configured together, two sentences. */
+  public void test2SentencesDictionaryAndMaxEnt() throws Exception {
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_punc, null, null,
+        SENTENCES_both_posTags, null, null, true);
+  }
+
+  /** KeywordRepeat before the dictionary lemmatizer, duplicates removed afterwards. */
+  public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter(KeywordRepeatFilterFactory.class)
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
+        .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_keep_orig_punc, null, null,
+        SENTENCES_keep_orig_posTags, null, null, true);
+  }
+
+  /** KeywordRepeat before the MaxEnt lemmatizer, duplicates removed afterwards. */
+  public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter(KeywordRepeatFilterFactory.class)
+        .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
+        .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_max_ent_keep_orig_punc, null, null,
+        SENTENCES_keep_orig_posTags, null, null, true);
+  }
+
+  /** KeywordRepeat before the combined dictionary + MaxEnt lemmatizer, duplicates removed afterwards. */
+  public void testKeywordAttributeAwarenessDictionaryAndMaxEnt() throws Exception {
+    CustomAnalyzer analyzer = posTaggedPipeline()
+        .addTokenFilter(KeywordRepeatFilterFactory.class)
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
+        .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_keep_orig_punc, null, null,
+        SENTENCES_both_keep_orig_posTags, null, null, true);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
new file mode 100644
index 0000000..10372d0
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+
+/**
+ * Needs the OpenNLP Tokenizer because it creates full streams of punctuation.
+ * The POS model is based on this tokenization.
+ *
+ * Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
+ */
+public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
+
+  // Input text, followed by the expected terms, offsets and POS tags (one entry per token).
+  private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
+  private static final String[] SENTENCES_punc
+      = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
+  private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
+  private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
+  private static final String[] SENTENCES_posTags
+      = {"NN", "NN", "CD", "VBZ", "CD", "NNS", ".", "NN", "NN", "CD", ",", "CD", "NNS", "."};
+
+  // Input without any sentence-ending punctuation.
+  private static final String NO_BREAK = "No period";
+  private static final String[] NO_BREAK_terms = {"No", "period"};
+  private static final int[] NO_BREAK_startOffsets = {0, 3};
+  private static final int[] NO_BREAK_endOffsets = {2, 9};
+
+  // Model files, resolved through the ClasspathResourceLoader created in posPipeline().
+  private static final String sentenceModelFile = "en-test-sent.bin";
+  private static final String tokenizerModelFile = "en-test-tokenizer.bin";
+  private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
+
+
+  /** Converts every string to its UTF-8 bytes; {@code null} entries stay {@code null}. */
+  private static byte[][] toPayloads(String... strings) {
+    return Arrays.stream(strings).map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8)).toArray(byte[][]::new);
+  }
+
+  /**
+   * Starts the chain every test shares: OpenNLP tokenizer followed by the POS tagger.
+   * The builder is returned unfinished so a test can append further filters before building.
+   */
+  private CustomAnalyzer.Builder posPipeline() throws IOException {
+    return CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+        .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+        .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile);
+  }
+
+  /** Terms and offsets only; the POS filter must not disturb them. */
+  public void testBasic() throws IOException {
+    CustomAnalyzer analyzer = posPipeline().build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
+  }
+
+  /** POS tags are expected first in the token-type position, then as payloads via TypeAsPayload. */
+  public void testPOS() throws Exception {
+    CustomAnalyzer analyzer = posPipeline().build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
+        SENTENCES_posTags, null, null, true);
+
+    analyzer = posPipeline()
+        .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
+        null, null, null, true, toPayloads(SENTENCES_posTags));
+  }
+
+  /** Text without a sentence terminator must still be tokenized with correct offsets. */
+  public void testNoBreak() throws Exception {
+    CustomAnalyzer analyzer = posPipeline().build();
+    assertAnalyzesTo(analyzer, NO_BREAK, NO_BREAK_terms, NO_BREAK_startOffsets, NO_BREAK_endOffsets,
+        null, null, null, true);
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPSentenceBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPSentenceBreakIterator.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPSentenceBreakIterator.java
new file mode 100644
index 0000000..4ee6570
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPSentenceBreakIterator.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.CharArrayIterator;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.BeforeClass;
+
+public class TestOpenNLPSentenceBreakIterator extends LuceneTestCase {
+
+  // Three sentences starting at offsets 0, 31 and 59; the whole text is 102 chars long.
+  // The literal offsets asserted in do3SentenceTest() refer to these positions.
+  private static final String TEXT
+      = "Sentence number 1 has 6 words. Sentence number 2, 5 words. And finally, sentence number 3 has 8 words.";
+  private static final String[] SENTENCES = new String[] {
+      "Sentence number 1 has 6 words. ", "Sentence number 2, 5 words. ", "And finally, sentence number 3 has 8 words." };
+  private static final String PADDING = " Word. Word. ";
+  private static final String sentenceModelFile = "en-test-sent.bin";
+
+
+  /**
+   * Loads the sentence model through a classpath loader before any test runs. The tests
+   * afterwards ask OpenNLPOpsFactory for a detector by model name only (presumably served
+   * from a cache filled here, as the method name suggests).
+   */
+  @BeforeClass
+  public static void populateCache() throws IOException {
+    OpenNLPOpsFactory.getSentenceModel
+        (sentenceModelFile, new ClasspathResourceLoader(TestOpenNLPSentenceBreakIterator.class));
+  }
+
+  /** Creates a fresh break iterator backed by the test sentence model. */
+  private BreakIterator newBreakIterator() throws Exception {
+    NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
+    return new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
+  }
+
+  /** Runs the three-sentence checks for both a String source and a CharArrayIterator source. */
+  public void testThreeSentences() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(TEXT); // String is converted to StringCharacterIterator
+    do3SentenceTest(bi);
+
+    bi.setText(getCharArrayIterator(TEXT));
+    do3SentenceTest(bi);
+  }
+
+  /** Wraps the whole of {@code text} in a CharArrayIterator. */
+  private CharacterIterator getCharArrayIterator(String text) {
+    return getCharArrayIterator(text, 0, text.length());
+  }
+
+  /** Wraps the slice {@code [start, start + length)} of {@code text} in a CharArrayIterator. */
+  private CharacterIterator getCharArrayIterator(String text, int start, int length) {
+    CharArrayIterator charArrayIterator = new CharArrayIterator() {
+      // Lie about all surrogates to the sentence tokenizer,
+      // instead we treat them all as SContinue so we won't break around them.
+      @Override
+      protected char jreBugWorkaround(char ch) {
+        return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
+      }
+    };
+    charArrayIterator.setText(text.toCharArray(), start, length);
+    return charArrayIterator;
+  }
+
+  /** Walks TEXT forwards and backwards, then probes following(), preceding() and next(n). */
+  private void do3SentenceTest(BreakIterator bi) {
+    // Forward traversal.
+    assertEquals(0, bi.current());
+    assertEquals(0, bi.first());
+    assertEquals(SENTENCES[0], TEXT.substring(bi.current(), bi.next()));
+    assertEquals(SENTENCES[1], TEXT.substring(bi.current(), bi.next()));
+    int current = bi.current();
+    assertEquals(bi.getText().getEndIndex(), bi.next());
+    int next = bi.current();
+    assertEquals(SENTENCES[2], TEXT.substring(current, next));
+    assertEquals(BreakIterator.DONE, bi.next());
+
+    // Backward traversal.
+    assertEquals(TEXT.length(), bi.last());
+    int end = bi.current();
+    assertEquals(SENTENCES[2], TEXT.substring(bi.previous(), end));
+    end = bi.current();
+    assertEquals(SENTENCES[1], TEXT.substring(bi.previous(), end));
+    end = bi.current();
+    assertEquals(SENTENCES[0], TEXT.substring(bi.previous(), end));
+    assertEquals(BreakIterator.DONE, bi.previous());
+    assertEquals(0, bi.current());
+
+    // Random access; 31 and 59 are the starts of the second and third sentences.
+    assertEquals(59, bi.following(39));
+    assertEquals(59, bi.following(31));
+    assertEquals(31, bi.following(30));
+
+    assertEquals(0, bi.preceding(57));
+    assertEquals(0, bi.preceding(58));
+    assertEquals(31, bi.preceding(59));
+
+    // Relative movement by more than one boundary.
+    assertEquals(0, bi.first());
+    assertEquals(59, bi.next(2));
+    assertEquals(0, bi.next(-2));
+  }
+
+  public void testSingleSentence() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(SENTENCES[0]));
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  /**
+   * Checks an iterator whose text consists of exactly one sentence, {@code text}.
+   * Positions reported by the iterator are absolute; {@code start} (the iterator's begin
+   * index) is subtracted where they are mapped onto indexes into {@code text}.
+   */
+  private void test1Sentence(BreakIterator bi, String text) {
+    int start = bi.getText().getBeginIndex();
+    assertEquals(start, bi.first());
+    int current = bi.current();
+    assertEquals(bi.getText().getEndIndex(), bi.next());
+    // Keep 'end' absolute: the original subtracted 'start' here and again in the substring
+    // call below, which is only harmless while the begin index is 0.
+    int end = bi.current();
+    assertEquals(text, text.substring(current - start, end - start));
+
+    assertEquals(text.length(), bi.last() - start);
+    end = bi.current();
+    bi.previous();
+    assertEquals(BreakIterator.DONE, bi.previous());
+    int previous = bi.current();
+    assertEquals(text, text.substring(previous - start, end - start));
+    assertEquals(start, bi.current());
+
+    assertEquals(BreakIterator.DONE, bi.following(bi.last() / 2 + start));
+
+    assertEquals(BreakIterator.DONE, bi.preceding(bi.last() / 2 + start));
+
+    assertEquals(start, bi.first());
+    assertEquals(BreakIterator.DONE, bi.next(13));
+    assertEquals(BreakIterator.DONE, bi.next(-8));
+  }
+
+  /** The sentence is followed by padding that lies outside the iterated slice. */
+  public void testSliceEnd() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(SENTENCES[0] + PADDING, 0, SENTENCES[0].length()));
+
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  /** The sentence is preceded by padding that lies outside the iterated slice. */
+  public void testSliceStart() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(PADDING + SENTENCES[0], PADDING.length(), SENTENCES[0].length()));
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  /** The sentence is surrounded by padding on both sides of the iterated slice. */
+  public void testSliceMiddle() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(PADDING + SENTENCES[0] + PADDING, PADDING.length(), SENTENCES[0].length()));
+
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  /** the current position must be ignored, initial position is always first() */
+  public void testFirstPosition() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(SENTENCES[0]));
+    assertEquals(SENTENCES[0].length(), bi.last()); // side-effect: set current position to last()
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  public void testWhitespaceOnly() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(" \n \n\n\r\n\t \n");
+    test0Sentences(bi);
+  }
+
+  public void testEmptyString() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText("");
+    test0Sentences(bi);
+  }
+
+  /** Expectations for text in which no sentence is found: every movement reports DONE. */
+  private void test0Sentences(BreakIterator bi) {
+    assertEquals(0, bi.current());
+    assertEquals(0, bi.first());
+    assertEquals(BreakIterator.DONE, bi.next());
+    assertEquals(0, bi.last());
+    assertEquals(BreakIterator.DONE, bi.previous());
+    assertEquals(BreakIterator.DONE, bi.following(0));
+    assertEquals(BreakIterator.DONE, bi.preceding(0));
+    assertEquals(0, bi.first());
+    assertEquals(BreakIterator.DONE, bi.next(13));
+    assertEquals(BreakIterator.DONE, bi.next(-8));
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPTokenizerFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPTokenizerFactory.java
new file mode 100644
index 0000000..db2bbb2
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPTokenizerFactory.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+import org.junit.Test;
+
+/**
+ * Tests the Tokenizer as well- the Tokenizer needs the OpenNLP model files,
+ * which this can load from src/test-files/opennlp/solr/conf
+ *
+ */
+public class TestOpenNLPTokenizerFactory extends BaseTokenStreamTestCase {
+
+ static private String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
+ static private String[] SENTENCES_split = {"Sentence number 1 has 6 words. ", "Sentence number 2, 5 words."};
+ static private String[] SENTENCES_punc = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
+ static private int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
+ static private int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
+
+ static private String SENTENCE1 = "Sentence number 1 has 6 words.";
+ static private String[] SENTENCE1_punc = {"Sentence", "number", "1", "has", "6", "words", "."};
+
+ @Test
+ public void testTokenizer() throws IOException {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+ .withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin", "tokenizerModel", "en-test-tokenizer.bin")
+ .build();
+ assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
+ assertAnalyzesTo(analyzer, SENTENCE1, SENTENCE1_punc);
+ }
+
+ @Test
+ public void testTokenizerNoSentenceDetector() throws IOException {
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+ .withTokenizer("opennlp", "tokenizerModel", "en-test-tokenizer.bin")
+ .build();
+ });
+ assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'sentenceModel'"));
+ }
+
+ @Test
+ public void testTokenizerNoTokenizer() throws IOException {
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+ CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+ .withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin")
+ .build();
+ });
+ assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'tokenizerModel'"));
+ }
+
+ // test analyzer caching the tokenizer
+ @Test
+ public void testClose() throws IOException {
+ Map<String,String> args = new HashMap<String,String>() {{ put("sentenceModel", "en-test-sent.bin");
+ put("tokenizerModel", "en-test-tokenizer.bin"); }};
+ OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
+ factory.inform(new ClasspathResourceLoader(getClass()));
+
+ Tokenizer ts = factory.create(newAttributeFactory());
+ ts.setReader(new StringReader(SENTENCES));
+
+ ts.reset();
+ ts.close();
+ ts.reset();
+ ts.setReader(new StringReader(SENTENCES));
+ assertTokenStreamContents(ts, SENTENCES_punc);
+ ts.close();
+ ts.reset();
+ ts.setReader(new StringReader(SENTENCES));
+ assertTokenStreamContents(ts, SENTENCES_punc);
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/tools/test-model-data/README.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/tools/test-model-data/README.txt b/lucene/analysis/opennlp/src/tools/test-model-data/README.txt
new file mode 100644
index 0000000..3ac0aa3
--- /dev/null
+++ b/lucene/analysis/opennlp/src/tools/test-model-data/README.txt
@@ -0,0 +1,6 @@
+Use small training data to create small models for unit tests.
+Training data was derived from the Reuters corpus in a very unscientific way.
+Tagging was done with the CCG Urbana-Champaign online demos:
+ http://cogcomp.cs.illinois.edu/page/demos
+
+Run 'ant train-test-models' to generate models from training data here.