You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2017/12/15 16:26:04 UTC

[11/12] lucene-solr:branch_7x: LUCENE-2899: Add OpenNLP Analysis capabilities as a module

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/package-info.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/package-info.java
new file mode 100644
index 0000000..527e24f
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Analysis components based on OpenNLP
+ */
+package org.apache.lucene.analysis.opennlp;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPChunkerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPChunkerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPChunkerOp.java
new file mode 100644
index 0000000..f6a5ea8
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPChunkerOp.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.IOException;
+import opennlp.tools.chunker.ChunkerME;
+import opennlp.tools.chunker.ChunkerModel;
+
+/**
+ * Thin wrapper around the OpenNLP chunking tool.
+ * Requires binary models from OpenNLP project on SourceForge.
+ */
+public class NLPChunkerOp {
+  private final ChunkerME chunker;
+
+  /**
+   * Creates a chunker backed by the given model.
+   *
+   * @param chunkerModel the model handed to {@link ChunkerME}
+   */
+  public NLPChunkerOp(ChunkerModel chunkerModel) throws IOException {
+    this.chunker = new ChunkerME(chunkerModel);
+  }
+
+  /**
+   * Chunks one sequence of words.
+   *
+   * @param words the words to chunk
+   * @param tags  the tags passed to the chunker alongside the words
+   * @param probs if non-null, passed to {@link ChunkerME#probs(double[])} after chunking
+   * @return the array returned by {@link ChunkerME#chunk(String[], String[])}
+   */
+  public synchronized String[] getChunks(String[] words, String[] tags, double[] probs) {
+    final String[] result = chunker.chunk(words, tags);
+    if (probs != null) {
+      chunker.probs(probs);
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
new file mode 100644
index 0000000..b09c63e
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPLemmatizerOp.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import opennlp.tools.lemmatizer.LemmatizerME;
+import opennlp.tools.lemmatizer.LemmatizerModel;
+
+/**
+ * <p>Wrapper around the OpenNLP lemmatizer tools.</p>
+ * <p>
+ *   A dictionary-based lemmatizer and a MaxEnt lemmatizer are both supported.
+ *   When both are configured, the dictionary-based lemmatizer runs first and
+ *   the MaxEnt lemmatizer is only consulted for tokens the dictionary does not know.
+ * </p>
+ * <p>
+ *   The MaxEnt implementation requires binary models from OpenNLP project on SourceForge.
+ * </p>
+ */
+public class NLPLemmatizerOp {
+  private final DictionaryLemmatizer dictionaryLemmatizer;
+  private final LemmatizerME lemmatizerME;
+
+  /**
+   * @param dictionary      stream to read the lemma dictionary from, or null for no dictionary lemmatizer
+   * @param lemmatizerModel MaxEnt model, or null for no MaxEnt lemmatizer
+   */
+  public NLPLemmatizerOp(InputStream dictionary, LemmatizerModel lemmatizerModel) throws IOException {
+    assert dictionary != null || lemmatizerModel != null : "At least one parameter must be non-null";
+    if (dictionary != null) {
+      dictionaryLemmatizer = new DictionaryLemmatizer(dictionary);
+    } else {
+      dictionaryLemmatizer = null;
+    }
+    if (lemmatizerModel != null) {
+      lemmatizerME = new LemmatizerME(lemmatizerModel);
+    } else {
+      lemmatizerME = null;
+    }
+  }
+
+  /**
+   * Lemmatizes a sequence of words.
+   *
+   * @param words   the words to lemmatize
+   * @param postags the tags passed to the lemmatizer(s) alongside the words
+   * @return one entry per lemmatizer output; where no lemma was found the original word is put back
+   */
+  public String[] lemmatize(String[] words, String[] postags) {
+    if (dictionaryLemmatizer == null) {
+      // MaxEnt only: "_" marks a token without a lemma, so restore the original word.
+      final String[] predicted = lemmatizerME.lemmatize(words, postags);
+      for (int idx = 0; idx < predicted.length; idx++) {
+        if ("_".equals(predicted[idx])) {
+          predicted[idx] = words[idx];
+        }
+      }
+      return predicted;
+    }
+
+    final String[] result = dictionaryLemmatizer.lemmatize(words, postags);
+    String[] fallback = null;            // MaxEnt output, computed at most once and only on demand
+    for (int idx = 0; idx < result.length; idx++) {
+      if (!result[idx].equals("O")) {
+        continue;                        // the dictionary knew this word
+      }
+      if (lemmatizerME == null) {
+        result[idx] = words[idx];        // no MaxEnt lemmatizer: put back the original word
+        continue;
+      }
+      if (fallback == null) {
+        fallback = lemmatizerME.lemmatize(words, postags);
+      }
+      // "_" means MaxEnt found no lemma either: put back the original word.
+      result[idx] = "_".equals(fallback[idx]) ? words[idx] : fallback[idx];
+    }
+    return result;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPNERTaggerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPNERTaggerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPNERTaggerOp.java
new file mode 100644
index 0000000..22e617d
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPNERTaggerOp.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Supply OpenNLP Named Entity Recognition tool
+ * Requires binary models from OpenNLP project on SourceForge.
+ *
+ * Usage: from <a href="http://opennlp.apache.org/docs/1.8.3/manual/opennlp.html#tools.namefind.recognition.api"
+ *             >the OpenNLP documentation</a>:
+ *
+ * "The NameFinderME class is not thread safe, it must only be called from one thread.
+ * To use multiple threads multiple NameFinderME instances sharing the same model instance
+ * can be created. The input text should be segmented into documents, sentences and tokens.
+ * To perform entity detection an application calls the find method for every sentence in
+ * the document. After every document clearAdaptiveData must be called to clear the adaptive
+ * data in the feature generators. Not calling clearAdaptiveData can lead to a sharp drop
+ * in the detection rate after a few documents."
+ *
+ */
+public class NLPNERTaggerOp {
+  private final TokenNameFinder nameFinder;
+
+  /**
+   * Creates a name finder backed by the given model.
+   *
+   * @param model the model handed to {@link NameFinderME}
+   */
+  public NLPNERTaggerOp(TokenNameFinderModel model) {
+    this.nameFinder = new NameFinderME(model);
+  }
+
+  /**
+   * Finds names in one sentence.
+   * Synchronized on the same monitor as {@link #reset()} so that the two calls
+   * never run concurrently against the underlying name finder.
+   *
+   * @param words the tokens of the sentence
+   * @return the spans returned by {@link TokenNameFinder#find(String[])}
+   */
+  public synchronized Span[] getNames(String[] words) {
+    return nameFinder.find(words);
+  }
+
+  /**
+   * Clears the name finder's adaptive data; per the OpenNLP documentation quoted
+   * on this class, this is to be called after every document.
+   */
+  public synchronized void reset() {
+    nameFinder.clearAdaptiveData();
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java
new file mode 100644
index 0000000..447e1c0
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPPOSTaggerOp.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.IOException;
+
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTagger;
+import opennlp.tools.postag.POSTaggerME;
+
+/**
+ * Thin wrapper around the OpenNLP Parts-Of-Speech tagging tool.
+ * Requires binary models from OpenNLP project on SourceForge.
+ */
+public class NLPPOSTaggerOp {
+  private final POSTagger tagger;
+
+  /**
+   * Creates a tagger backed by the given model.
+   *
+   * @param model the model handed to {@link POSTaggerME}
+   */
+  public NLPPOSTaggerOp(POSModel model) throws IOException {
+    this.tagger = new POSTaggerME(model);
+  }
+
+  /**
+   * Tags one sequence of words.
+   *
+   * @param words the words to tag
+   * @return the array returned by {@link POSTagger#tag(String[])}
+   */
+  public synchronized String[] getPOSTags(String[] words) {
+    final String[] tags = tagger.tag(words);
+    return tags;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPSentenceDetectorOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPSentenceDetectorOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPSentenceDetectorOp.java
new file mode 100644
index 0000000..21983ce
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPSentenceDetectorOp.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.IOException;
+
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Thin wrapper around the OpenNLP sentence detector tool.
+ * Requires binary models from OpenNLP project on SourceForge.
+ */
+public class NLPSentenceDetectorOp {
+  private final SentenceDetectorME sentenceSplitter;
+
+  /**
+   * Creates a sentence detector backed by the given model.
+   *
+   * @param model the model handed to {@link SentenceDetectorME}
+   */
+  public NLPSentenceDetectorOp(SentenceModel model) throws IOException {
+    this.sentenceSplitter = new SentenceDetectorME(model);
+  }
+
+  /**
+   * Creates an instance without a model; {@link #splitSentences(String)} then
+   * reports the whole input as a single span.
+   */
+  public NLPSentenceDetectorOp() {
+    this.sentenceSplitter = null;
+  }
+
+  /**
+   * Splits the given text into sentence spans.
+   *
+   * @param line the text to split
+   * @return the detector's spans, or one span covering {@code [0, line.length())} when no model is set
+   */
+  public synchronized Span[] splitSentences(String line) {
+    if (sentenceSplitter == null) {
+      return new Span[] { new Span(0, line.length()) };
+    }
+    return sentenceSplitter.sentPosDetect(line);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPTokenizerOp.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPTokenizerOp.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPTokenizerOp.java
new file mode 100644
index 0000000..0aeb713
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/NLPTokenizerOp.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Thin wrapper around the OpenNLP tokenizer tool.
+ * Requires binary models from OpenNLP project on SourceForge.
+ */
+public class NLPTokenizerOp {
+  private final Tokenizer tokenizer;
+
+  /**
+   * Creates a tokenizer backed by the given model.
+   *
+   * @param model the model handed to {@link TokenizerME}
+   */
+  public NLPTokenizerOp(TokenizerModel model) {
+    this.tokenizer = new TokenizerME(model);
+  }
+
+  /**
+   * Creates an instance without a model; {@link #getTerms(String)} then
+   * reports the whole input as a single span.
+   */
+  public NLPTokenizerOp() {
+    this.tokenizer = null;
+  }
+
+  /**
+   * Tokenizes one sentence.
+   *
+   * @param sentence the text to tokenize
+   * @return the tokenizer's spans, or one span covering {@code [0, sentence.length())} when no model is set
+   */
+  public synchronized Span[] getTerms(String sentence) {
+    return tokenizer != null
+        ? tokenizer.tokenizePos(sentence)
+        : new Span[] { new Span(0, sentence.length()) };
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
new file mode 100644
index 0000000..5348857
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/OpenNLPOpsFactory.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp.tools;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.lemmatizer.LemmatizerModel;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.TokenizerModel;
+import org.apache.lucene.analysis.util.ResourceLoader;
+
+/**
+ * Factory for the OpenNLP operation wrappers in this package (sentence detector,
+ * tokenizer, POS tagger, chunker, NER tagger and lemmatizer).
+ * Caches loaded model objects by resource name. Assumes model objects are thread-safe.
+ * <p>
+ * The <code>get*Model</code> / {@link #getLemmatizerDictionary} methods load a resource
+ * and cache it; the other <code>get*</code> methods only look the resource up in the
+ * cache, so the corresponding load method has to be called first.
+ */
+public class OpenNLPOpsFactory {
+  private static final Map<String,SentenceModel> sentenceModels = new ConcurrentHashMap<>();
+  private static final Map<String,TokenizerModel> tokenizerModels = new ConcurrentHashMap<>();
+  private static final Map<String,POSModel> posTaggerModels = new ConcurrentHashMap<>();
+  private static final Map<String,ChunkerModel> chunkerModels = new ConcurrentHashMap<>();
+  private static final Map<String,TokenNameFinderModel> nerModels = new ConcurrentHashMap<>();
+  private static final Map<String,LemmatizerModel> lemmatizerModels = new ConcurrentHashMap<>();
+  private static final Map<String,String> lemmaDictionaries = new ConcurrentHashMap<>();
+
+  /**
+   * Returns a new sentence detector using the cached model of the given name,
+   * or a model-less detector when <code>modelName</code> is null.
+   */
+  public static NLPSentenceDetectorOp getSentenceDetector(String modelName) throws IOException {
+    if (modelName != null) {
+      SentenceModel model = sentenceModels.get(modelName);
+      return new NLPSentenceDetectorOp(model);
+    } else {
+      return new NLPSentenceDetectorOp();
+    }
+  }
+
+  /** Returns the cached sentence model of the given name, loading and caching it on first use. */
+  public static SentenceModel getSentenceModel(String modelName, ResourceLoader loader) throws IOException {
+    SentenceModel model = sentenceModels.get(modelName);
+    if (model == null) {
+      // Close the resource stream once the model has been read from it.
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new SentenceModel(resource);
+      }
+      sentenceModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /**
+   * Returns a new tokenizer using the cached model of the given name,
+   * or a model-less tokenizer when <code>modelName</code> is null.
+   */
+  public static NLPTokenizerOp getTokenizer(String modelName) throws IOException {
+    if (modelName == null) {
+      return new NLPTokenizerOp();
+    } else {
+      TokenizerModel model = tokenizerModels.get(modelName);
+      return new NLPTokenizerOp(model);
+    }
+  }
+
+  /** Returns the cached tokenizer model of the given name, loading and caching it on first use. */
+  public static TokenizerModel getTokenizerModel(String modelName, ResourceLoader loader) throws IOException {
+    TokenizerModel model = tokenizerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new TokenizerModel(resource);
+      }
+      tokenizerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /** Returns a new POS tagger using the cached model of the given name. */
+  public static NLPPOSTaggerOp getPOSTagger(String modelName) throws IOException {
+    POSModel model = posTaggerModels.get(modelName);
+    return new NLPPOSTaggerOp(model);
+  }
+
+  /** Returns the cached POS model of the given name, loading and caching it on first use. */
+  public static POSModel getPOSTaggerModel(String modelName, ResourceLoader loader) throws IOException {
+    POSModel model = posTaggerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new POSModel(resource);
+      }
+      posTaggerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /** Returns a new chunker using the cached model of the given name. */
+  public static NLPChunkerOp getChunker(String modelName) throws IOException {
+    ChunkerModel model = chunkerModels.get(modelName);
+    return new NLPChunkerOp(model);
+  }
+
+  /** Returns the cached chunker model of the given name, loading and caching it on first use. */
+  public static ChunkerModel getChunkerModel(String modelName, ResourceLoader loader) throws IOException {
+    ChunkerModel model = chunkerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new ChunkerModel(resource);
+      }
+      chunkerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /** Returns a new NER tagger using the cached model of the given name. */
+  public static NLPNERTaggerOp getNERTagger(String modelName) throws IOException {
+    TokenNameFinderModel model = nerModels.get(modelName);
+    return new NLPNERTaggerOp(model);
+  }
+
+  /** Returns the cached NER model of the given name, loading and caching it on first use. */
+  public static TokenNameFinderModel getNERTaggerModel(String modelName, ResourceLoader loader) throws IOException {
+    TokenNameFinderModel model = nerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new TokenNameFinderModel(resource);
+      }
+      nerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /**
+   * Returns a new lemmatizer built from the cached dictionary and/or cached MaxEnt model.
+   *
+   * @param dictionaryFile      name of a cached lemma dictionary, or null for none
+   * @param lemmatizerModelFile name of a cached MaxEnt lemmatizer model, or null for none
+   */
+  public static NLPLemmatizerOp getLemmatizer(String dictionaryFile, String lemmatizerModelFile) throws IOException {
+    assert dictionaryFile != null || lemmatizerModelFile != null : "At least one parameter must be non-null";
+    InputStream dictionaryInputStream = null;
+    if (dictionaryFile != null) {
+      // The dictionary is cached as text; each lemmatizer gets its own in-memory stream over it.
+      String dictionary = lemmaDictionaries.get(dictionaryFile);
+      dictionaryInputStream = new ByteArrayInputStream(dictionary.getBytes(StandardCharsets.UTF_8));
+    }
+    LemmatizerModel lemmatizerModel = lemmatizerModelFile == null ? null : lemmatizerModels.get(lemmatizerModelFile);
+    return new NLPLemmatizerOp(dictionaryInputStream, lemmatizerModel);
+  }
+
+  /**
+   * Returns the cached content of the given lemma dictionary, reading it as UTF-8
+   * and caching it on first use.
+   */
+  public static String getLemmatizerDictionary(String dictionaryFile, ResourceLoader loader) throws IOException {
+    String dictionary = lemmaDictionaries.get(dictionaryFile);
+    if (dictionary == null) {
+      StringBuilder builder = new StringBuilder();
+      // Closing the reader also closes the underlying resource stream.
+      try (Reader reader = new InputStreamReader(loader.openResource(dictionaryFile), StandardCharsets.UTF_8)) {
+        char[] chars = new char[8192];
+        int numRead;
+        while ((numRead = reader.read(chars, 0, chars.length)) > 0) {
+          builder.append(chars, 0, numRead);
+        }
+      }
+      dictionary = builder.toString();
+      lemmaDictionaries.put(dictionaryFile, dictionary);
+    }
+    return dictionary;
+  }
+
+  /** Returns the cached lemmatizer model of the given name, loading and caching it on first use. */
+  public static LemmatizerModel getLemmatizerModel(String modelName, ResourceLoader loader) throws IOException {
+    LemmatizerModel model = lemmatizerModels.get(modelName);
+    if (model == null) {
+      try (InputStream resource = loader.openResource(modelName)) {
+        model = new LemmatizerModel(resource);
+      }
+      lemmatizerModels.put(modelName, model);
+    }
+    return model;
+  }
+
+  /** Empties every cache. Keeps unit tests from blowing out memory. */
+  public static void clearModels() {
+    sentenceModels.clear();
+    tokenizerModels.clear();
+    posTaggerModels.clear();
+    chunkerModels.clear();
+    nerModels.clear();
+    lemmatizerModels.clear();
+    lemmaDictionaries.clear();
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/package-info.java b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/package-info.java
new file mode 100644
index 0000000..523a084
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/org/apache/lucene/analysis/opennlp/tools/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tools to supply access to OpenNLP components.
+ */
+package org.apache.lucene.analysis.opennlp.tools;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/java/overview.html
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/java/overview.html b/lucene/analysis/opennlp/src/java/overview.html
new file mode 100644
index 0000000..bf70e95
--- /dev/null
+++ b/lucene/analysis/opennlp/src/java/overview.html
@@ -0,0 +1,61 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+  <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>
+    Apache Lucene OpenNLP integration module
+  </title>
+</head>
+<body>
+<p>
+  This module exposes functionality from
+  <a href="http://opennlp.apache.org">Apache OpenNLP</a> to Apache Lucene.
+  The Apache OpenNLP library is a machine learning based toolkit for the processing of natural language text.
+<p>
+  For an introduction to Lucene's analysis API, see the {@link org.apache.lucene.analysis} package documentation.
+<p>
+  The OpenNLP Tokenizer behavior is similar to the <code>WhitespaceTokenizer</code> but is smart about
+  inter-word punctuation. The term stream looks very much like the way you parse words and
+  punctuation while reading.  The major difference between this tokenizer and most other
+  tokenizers shipped with Lucene is that punctuation is tokenized.  This is required for
+  the following taggers to operate properly.
+<p>
+  The OpenNLP taggers annotate terms using the <code>TypeAttribute</code>.
+<ul>
+  <li><code>OpenNLPTokenizer</code> segments text into sentences or words. This Tokenizer
+    uses the OpenNLP Sentence Detector and/or Tokenizer classes.  When used together, the
+    Tokenizer receives sentences and can do a better job.</li>
+  <li><code>OpenNLPFilter</code> tags words using one or more technologies: Part-of-Speech,
+    Chunking, and Named Entity Recognition.  These tags are assigned as token types.  Note that
+    only one of these operations will tag a given token.
+  </li>
+</ul>
+<p>
+  Since the <code>TypeAttribute</code> is not stored in the index, it is recommended that one
+  of these filters is used following <code>OpenNLPFilter</code> to enable search against the
+  assigned tags:
+<ul>
+  <li><code>TypeAsPayloadFilter</code> copies the <code>TypeAttribute</code> value to the
+    <code>PayloadAttribute</code></li>
+  <li><code>TypeAsSynonymFilter</code> creates a cloned token at the same position as each
+    tagged token, and copies the <code>TypeAttribute</code> value to the <code>CharTermAttribute</code>, optionally
+    with a customized prefix (so that tags effectively occupy a different namespace from token
+    text).</li>
+</ul>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
new file mode 100644
index 0000000..61a685d
--- /dev/null
+++ b/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -0,0 +1,18 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.lucene.analysis.opennlp.OpenNLPChunkerFilterFactory
+org.apache.lucene.analysis.opennlp.OpenNLPLemmatizerFilterFactory
+org.apache.lucene.analysis.opennlp.OpenNLPPOSFilterFactory

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory b/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
new file mode 100644
index 0000000..076b308
--- /dev/null
+++ b/lucene/analysis/opennlp/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.lucene.analysis.opennlp.OpenNLPTokenizerFactory

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin
new file mode 100644
index 0000000..8151914
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-chunker.bin differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict
new file mode 100644
index 0000000..d1d486c
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmas.dict
@@ -0,0 +1,12 @@
+they	NNP	they
+sent	VBD	send
+him	PRP	he
+running	VBG	run
+in	IN	in
+the	DT	the
+evening	NN	evening
+he	PRP	he
+did	VBD	do
+not	RB	not
+come	VB	come
+back	RB	back

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin
new file mode 100644
index 0000000..e62df7e
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-lemmatizer.bin differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner-person.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner-person.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner-person.bin
new file mode 100644
index 0000000..0b40aac
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-ner-person.bin differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin
new file mode 100644
index 0000000..b77fb46
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-pos-maxent.bin differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin
new file mode 100644
index 0000000..4252bcb
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-sent.bin differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin
new file mode 100644
index 0000000..94668c0
Binary files /dev/null and b/lucene/analysis/opennlp/src/test-files/org/apache/lucene/analysis/opennlp/en-test-tokenizer.bin differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
new file mode 100644
index 0000000..013348c
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPChunkerFilterFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+
+/**
+ * Exercises the "opennlpChunker" token filter through {@link CustomAnalyzer} chains.
+ * The OpenNLP tokenizer comes first because the expected token stream keeps punctuation as tokens,
+ * and the OpenNLP POS filter runs ahead of the chunker to supply the POS tags.
+ *
+ * Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
+ */
+public class TestOpenNLPChunkerFilterFactory extends BaseTokenStreamTestCase {
+
+  // Input text followed by the terms, offsets and chunk tags expected for its tokens, in order.
+  private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
+  private static final String[] SENTENCES_punc
+      = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
+  private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
+  private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
+  private static final String[] SENTENCES_chunks
+      = { "B-NP", "I-NP", "I-NP", "B-VP", "B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O", "B-NP", "I-NP", "O" };
+
+  private static final String sentenceModelFile = "en-test-sent.bin";
+  private static final String tokenizerModelFile = "en-test-tokenizer.bin";
+  private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
+  private static final String chunkerModelFile = "en-test-chunker.bin";
+
+  /** Converts each string to its UTF-8 bytes, leaving null entries as null. */
+  private static byte[][] toPayloads(String... strings) {
+    return Arrays.stream(strings).map(text -> text != null ? text.getBytes(StandardCharsets.UTF_8) : null).toArray(byte[][]::new);
+  }
+
+  /** Starts the chain shared by both tests: OpenNLP tokenizer, then POS filter, then chunker filter. */
+  private CustomAnalyzer.Builder chunkerChain() throws Exception {
+    return CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+        .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+        .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
+        .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile);
+  }
+
+  public void testBasic() throws Exception {
+    assertAnalyzesTo(chunkerChain().build(), SENTENCES, SENTENCES_punc, SENTENCES_startOffsets,
+        SENTENCES_endOffsets, SENTENCES_chunks, null, null, true);
+  }
+
+  public void testPayloads() throws Exception {
+    CustomAnalyzer analyzer = chunkerChain().addTokenFilter(TypeAsPayloadTokenFilterFactory.class).build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
+        null, null, null, true, toPayloads(SENTENCES_chunks));
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
new file mode 100644
index 0000000..0491b91
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPLemmatizerFilterFactory.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory;
+import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+
+/**
+ * Tests the "opennlplemmatizer" token filter configured with a dictionary, a lemmatizer model, or both.
+ * Every chain runs the OpenNLP tokenizer and the OpenNLP POS filter ahead of the lemmatizer.
+ *
+ * Expected POS tags and lemmas depend on the test model and dictionary files named by the constants below.
+ */
+public class TestOpenNLPLemmatizerFilterFactory extends BaseTokenStreamTestCase {
+
+  // Expected terms differ by configuration: the dictionary yields "run" where the test model yields "runn".
+  private static final String SENTENCE = "They sent him running in the evening.";
+  private static final String[] SENTENCE_dict_punc =   {"they", "send", "he",  "run",  "in", "the", "evening", "."};
+  private static final String[] SENTENCE_maxent_punc = {"they", "send", "he",  "runn", "in", "the", "evening", "."};
+  private static final String[] SENTENCE_posTags =     {"NNP",  "VBD",  "PRP", "VBG",  "IN", "DT",  "NN",      "."};
+
+  private static final String SENTENCES = "They sent him running in the evening. He did not come back.";
+  private static final String[] SENTENCES_dict_punc
+      = {"they", "send", "he",  "run",  "in", "the", "evening", ".", "he",  "do",  "not", "come", "back", "."};
+  private static final String[] SENTENCES_maxent_punc
+      = {"they", "send", "he",  "runn", "in", "the", "evening", ".", "he",  "do",  "not", "come", "back", "."};
+  private static final String[] SENTENCES_posTags
+      = {"NNP",  "VBD",  "PRP", "VBG",  "IN", "DT",  "NN",      ".", "PRP", "VBD", "RB",  "VB",   "RB",   "."};
+
+  private static final String SENTENCE_both = "Konstantin Kalashnitsov constantly caliphed.";
+  private static final String[] SENTENCE_both_punc
+      = {"konstantin", "kalashnitsov", "constantly", "caliph", "."};
+  private static final String[] SENTENCE_both_posTags
+      = {"IN",         "JJ",          "NN",          "VBN",    "."};
+
+  private static final String SENTENCES_both = "Konstantin Kalashnitsov constantly caliphed. Coreena could care, completely.";
+  private static final String[] SENTENCES_both_punc
+      = {"konstantin", "kalashnitsov", "constantly", "caliph", ".", "coreena", "could", "care", ",", "completely", "."};
+  private static final String[] SENTENCES_both_posTags
+      = {"IN",         "JJ",           "NN",          "VBN",    ".", "NNP",     "VBN",   "NN",   ",", "NN",         "."};
+
+  // Expectations for the keyword-repeat chains: each original token precedes its lemma when the two differ.
+  private static final String[] SENTENCES_dict_keep_orig_punc
+      = {"They", "they", "sent", "send", "him", "he", "running", "run",  "in", "the", "evening", ".", "He", "he",   "did", "do", "not", "come", "back", "."};
+  private static final String[] SENTENCES_max_ent_keep_orig_punc
+      = {"They", "they", "sent", "send", "him", "he", "running", "runn", "in", "the", "evening", ".", "He", "he",   "did", "do", "not", "come", "back", "."};
+  private static final String[] SENTENCES_keep_orig_posTags
+      = {"NNP",  "NNP",  "VBD",  "VBD",  "PRP", "PRP", "VBG",    "VBG",  "IN", "DT",  "NN",      ".", "PRP", "PRP", "VBD", "VBD", "RB",  "VB",  "RB",   "."};
+
+  private static final String[] SENTENCES_both_keep_orig_punc
+      = {"Konstantin", "konstantin", "Kalashnitsov", "kalashnitsov", "constantly", "caliphed", "caliph", ".", "Coreena", "coreena", "could", "care", ",", "completely", "."};
+  private static final String[] SENTENCES_both_keep_orig_posTags
+      = {"IN",         "IN",         "JJ",           "JJ",           "NN",         "VBN",      "VBN",    ".", "NNP",     "NNP",     "VBN",   "NN",   ",", "NN",         "."};
+
+  // Resource names passed to the tokenizer and filter factories; every test refers to these constants.
+  private static final String tokenizerModelFile = "en-test-tokenizer.bin";
+  private static final String sentenceModelFile = "en-test-sent.bin";
+  private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
+  private static final String lemmatizerModelFile = "en-test-lemmatizer.bin";
+  private static final String lemmatizerDictFile = "en-test-lemmas.dict";
+
+  /** Starts the chain every test shares: the OpenNLP tokenizer followed by the OpenNLP POS filter. */
+  private CustomAnalyzer.Builder posTaggedChain() throws Exception {
+    return CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+        .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+        .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile);
+  }
+
+  public void test1SentenceDictionaryOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
+        SENTENCE_posTags, null, null, true);
+  }
+
+  public void test2SentencesDictionaryOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_punc, null, null,
+        SENTENCES_posTags, null, null, true);
+  }
+
+  public void test1SentenceMaxEntOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES.equals(SENTENCE) ? SENTENCES : SENTENCE, SENTENCE_maxent_punc, null, null,
+        SENTENCE_posTags, null, null, true);
+  }
+
+  public void test2SentencesMaxEntOnly() throws Exception {
+    // NOTE(review): only this test spells the filter name in mixed case; presumably lookup ignores case - confirm.
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_maxent_punc, null, null,
+        SENTENCES_posTags, null, null, true);
+  }
+
+  public void test1SentenceDictionaryAndMaxEnt() throws Exception {
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCE_both, SENTENCE_both_punc, null, null,
+        SENTENCE_both_posTags, null, null, true);
+  }
+
+  public void test2SentencesDictionaryAndMaxEnt() throws Exception {
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_punc, null, null,
+        SENTENCES_both_posTags, null, null, true);
+  }
+
+  /** Dictionary-only lemmatizer placed between KeywordRepeat and RemoveDuplicates; original terms must survive. */
+  public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter(KeywordRepeatFilterFactory.class)
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
+        .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_keep_orig_punc, null, null,
+        SENTENCES_keep_orig_posTags, null, null, true);
+  }
+
+  /** Model-only lemmatizer placed between KeywordRepeat and RemoveDuplicates; original terms must survive. */
+  public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter(KeywordRepeatFilterFactory.class)
+        .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
+        .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_max_ent_keep_orig_punc, null, null,
+        SENTENCES_keep_orig_posTags, null, null, true);
+  }
+
+  /** Dictionary-plus-model lemmatizer between KeywordRepeat and RemoveDuplicates; original terms must survive. */
+  public void testKeywordAttributeAwarenessDictionaryAndMaxEnt() throws Exception {
+    CustomAnalyzer analyzer = posTaggedChain()
+        .addTokenFilter(KeywordRepeatFilterFactory.class)
+        .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
+        .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
+        .build();
+    assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_keep_orig_punc, null, null,
+        SENTENCES_both_keep_orig_posTags, null, null, true);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
new file mode 100644
index 0000000..10372d0
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPPOSFilterFactory.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+
+/**
+ * Tests the "opennlpPOS" token filter behind the OpenNLP tokenizer.
+ * The OpenNLP tokenizer is needed because the expected token stream keeps punctuation as tokens,
+ * and the POS model is based on this tokenization.
+ *
+ * Tagging models are created from tiny test data in opennlp/tools/test-model-data/ and are not very accurate.
+ */
+public class TestOpenNLPPOSFilterFactory extends BaseTokenStreamTestCase {
+
+  // Two-sentence input followed by the terms, offsets and POS tags expected for its tokens, in order.
+  private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
+  private static final String[] SENTENCES_punc
+      = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
+  private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
+  private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
+  private static final String[] SENTENCES_posTags
+      = {"NN", "NN", "CD", "VBZ", "CD", "NNS", ".", "NN", "NN", "CD", ",", "CD", "NNS", "."};
+
+  // Input that has no sentence-ending punctuation, with its expected terms and offsets.
+  private static final String NO_BREAK = "No period";
+  private static final String[] NO_BREAK_terms = {"No", "period"};
+  private static final int[] NO_BREAK_startOffsets = {0, 3};
+  private static final int[] NO_BREAK_endOffsets = {2, 9};
+
+  // Resource names passed to the tokenizer and POS filter factories.
+  private static final String sentenceModelFile = "en-test-sent.bin";
+  private static final String tokenizerModelFile = "en-test-tokenizer.bin";
+  private static final String posTaggerModelFile = "en-test-pos-maxent.bin";
+
+  /** Converts each string to its UTF-8 bytes, leaving null entries as null. */
+  private static byte[][] toPayloads(String... strings) {
+    return Arrays.stream(strings).map(s -> s == null ? null : s.getBytes(StandardCharsets.UTF_8)).toArray(byte[][]::new);
+  }
+
+  /**
+   * Starts the chain shared by all tests: the OpenNLP tokenizer followed by the OpenNLP POS filter.
+   */
+  private CustomAnalyzer.Builder posChain() throws IOException {
+    return CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+        .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
+        .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile);
+  }
+
+  /** Checks only the terms and offsets produced for the two-sentence input. */
+  public void testBasic() throws IOException {
+    assertAnalyzesTo(posChain().build(), SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
+  }
+
+  /**
+   * Checks the expected POS tags twice: first directly from the tokenizer + POS filter chain, then as
+   * payloads after TypeAsPayloadTokenFilterFactory is appended to the same chain.
+   */
+  public void testPOS() throws Exception {
+    assertAnalyzesTo(posChain().build(), SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
+        SENTENCES_posTags, null, null, true);
+
+    CustomAnalyzer analyzer = posChain().addTokenFilter(TypeAsPayloadTokenFilterFactory.class).build();
+    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
+        null, null, null, true, toPayloads(SENTENCES_posTags));
+  }
+
+  /** Analyzes input that has no sentence-ending punctuation and checks its terms and offsets. */
+  public void testNoBreak() throws Exception {
+    assertAnalyzesTo(posChain().build(), NO_BREAK, NO_BREAK_terms, NO_BREAK_startOffsets, NO_BREAK_endOffsets,
+        null, null, null, true);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPSentenceBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPSentenceBreakIterator.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPSentenceBreakIterator.java
new file mode 100644
index 0000000..4ee6570
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPSentenceBreakIterator.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
+import org.apache.lucene.analysis.opennlp.tools.OpenNLPOpsFactory;
+import org.apache.lucene.analysis.util.CharArrayIterator;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.BeforeClass;
+
+/**
+ * Exercises the {@link BreakIterator} navigation methods ({@code first}, {@code last},
+ * {@code next}, {@code previous}, {@code following}, {@code preceding}) of
+ * {@link OpenNLPSentenceBreakIterator} over texts containing zero, one and three sentences,
+ * supplied both as a {@code String} and as a {@link CharArrayIterator} slice.
+ */
+public class TestOpenNLPSentenceBreakIterator extends LuceneTestCase {
+
+  private static final String TEXT
+      //                                                                                                     111
+      //           111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999000
+      // 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012
+      = "Sentence number 1 has 6 words. Sentence number 2, 5 words. And finally, sentence number 3 has 8 words.";
+  /** The three sentences of {@link #TEXT}; each non-final one includes its trailing space. */
+  private static final String[] SENTENCES = new String[] {
+    "Sentence number 1 has 6 words. ", "Sentence number 2, 5 words. ", "And finally, sentence number 3 has 8 words." };
+  /** Extra text placed around a sentence to test iterators that cover only a slice of a buffer. */
+  private static final String PADDING = " Word. Word. ";
+  private static final String sentenceModelFile = "en-test-sent.bin";
+
+
+  /** Loads the sentence model once so that the tests can look it up by file name. */
+  @BeforeClass
+  public static void populateCache() throws IOException {
+    OpenNLPOpsFactory.getSentenceModel
+        (sentenceModelFile, new ClasspathResourceLoader(TestOpenNLPSentenceBreakIterator.class));
+  }
+
+  /** Creates a fresh break iterator backed by the test sentence model; no text is set yet. */
+  private static BreakIterator newBreakIterator() throws Exception {
+    NLPSentenceDetectorOp sentenceDetectorOp = OpenNLPOpsFactory.getSentenceDetector(sentenceModelFile);
+    return new OpenNLPSentenceBreakIterator(sentenceDetectorOp);
+  }
+
+  public void testThreeSentences() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(TEXT); // String is converted to StringCharacterIterator
+    do3SentenceTest(bi);
+
+    // Same text again, this time through the CharArrayIterator code path.
+    bi.setText(getCharArrayIterator(TEXT));
+    do3SentenceTest(bi);
+  }
+
+  /** Wraps the whole of {@code text} in a {@link CharArrayIterator}. */
+  private CharacterIterator getCharArrayIterator(String text) {
+    return getCharArrayIterator(text, 0, text.length());
+  }
+
+  /**
+   * Wraps {@code length} chars of {@code text}, beginning at {@code start}, in a
+   * {@link CharArrayIterator}.
+   */
+  private CharacterIterator getCharArrayIterator(String text, int start, int length) {
+    CharArrayIterator charArrayIterator = new CharArrayIterator() {
+      // Lie about all surrogates to the sentence tokenizer,
+      // instead we treat them all as SContinue so we won't break around them.
+      @Override
+      protected char jreBugWorkaround(char ch) {
+        return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
+      }
+    };
+    charArrayIterator.setText(text.toCharArray(), start, length);
+    return charArrayIterator;
+  }
+
+  /** Checks every navigation method against the three known sentences of {@link #TEXT}. */
+  private void do3SentenceTest(BreakIterator bi) {
+    // Forward traversal from the initial position.
+    assertEquals(0, bi.current());
+    assertEquals(0, bi.first());
+    assertEquals(SENTENCES[0], TEXT.substring(bi.current(), bi.next()));
+    assertEquals(SENTENCES[1], TEXT.substring(bi.current(), bi.next()));
+    int current = bi.current();
+    assertEquals(bi.getText().getEndIndex(), bi.next());
+    int next = bi.current();
+    assertEquals(SENTENCES[2], TEXT.substring(current, next));
+    assertEquals(BreakIterator.DONE, bi.next());
+
+    // Backward traversal from the end.
+    assertEquals(TEXT.length(), bi.last());
+    int end = bi.current();
+    assertEquals(SENTENCES[2], TEXT.substring(bi.previous(), end));
+    end = bi.current();
+    assertEquals(SENTENCES[1], TEXT.substring(bi.previous(), end));
+    end = bi.current();
+    assertEquals(SENTENCES[0], TEXT.substring(bi.previous(), end));
+    assertEquals(BreakIterator.DONE, bi.previous());
+    assertEquals(0, bi.current());
+
+    // Random access by offset.
+    assertEquals(59, bi.following(39));
+    assertEquals(59, bi.following(31));
+    assertEquals(31, bi.following(30));
+
+    assertEquals(0, bi.preceding(57));
+    assertEquals(0, bi.preceding(58));
+    assertEquals(31, bi.preceding(59));
+
+    // Multi-step moves.
+    assertEquals(0, bi.first());
+    assertEquals(59, bi.next(2));
+    assertEquals(0, bi.next(-2));
+  }
+
+  public void testSingleSentence() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(SENTENCES[0]));
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  /**
+   * Checks every navigation method on an iterator whose text is the single sentence
+   * {@code text}. Iterator positions are translated into indexes of {@code text} by
+   * subtracting the iterator's begin index exactly once.
+   */
+  private void test1Sentence(BreakIterator bi, String text) {
+    int start = bi.getText().getBeginIndex();
+    assertEquals(start, bi.first());
+    int current = bi.current();
+    assertEquals(bi.getText().getEndIndex(), bi.next());
+    // Keep 'end' as an iterator position; the begin index is subtracted only at the substring
+    // call (subtracting it here as well would shift the range whenever start != 0).
+    int end = bi.current();
+    assertEquals(text, text.substring(current - start, end - start));
+
+    assertEquals(text.length(), bi.last() - start);
+    end = bi.current();
+    bi.previous();
+    assertEquals(BreakIterator.DONE, bi.previous());
+    int previous = bi.current();
+    assertEquals(text, text.substring(previous - start, end - start));
+    assertEquals(start, bi.current());
+
+    // Probe from the midpoint of [start, last()]; computed relative to start so the offset
+    // stays inside the text range for any begin index.
+    assertEquals(BreakIterator.DONE, bi.following(start + (bi.last() - start) / 2));
+
+    assertEquals(BreakIterator.DONE, bi.preceding(start + (bi.last() - start) / 2));
+
+    assertEquals(start, bi.first());
+    assertEquals(BreakIterator.DONE, bi.next(13));
+    assertEquals(BreakIterator.DONE, bi.next(-8));
+  }
+
+  /** The sentence is followed by padding that lies outside the iterated slice. */
+  public void testSliceEnd() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(SENTENCES[0] + PADDING, 0, SENTENCES[0].length()));
+
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  /** The sentence is preceded by padding that lies outside the iterated slice. */
+  public void testSliceStart() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(PADDING + SENTENCES[0], PADDING.length(), SENTENCES[0].length()));
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  /** The sentence has padding on both sides, all of it outside the iterated slice. */
+  public void testSliceMiddle() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(PADDING + SENTENCES[0] + PADDING, PADDING.length(), SENTENCES[0].length()));
+
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  /** the current position must be ignored, initial position is always first() */
+  public void testFirstPosition() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText(getCharArrayIterator(SENTENCES[0]));
+    assertEquals(SENTENCES[0].length(), bi.last()); // side-effect: set current position to last()
+    test1Sentence(bi, SENTENCES[0]);
+  }
+
+  public void testWhitespaceOnly() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText("   \n \n\n\r\n\t  \n");
+    test0Sentences(bi);
+  }
+
+  public void testEmptyString() throws Exception {
+    BreakIterator bi = newBreakIterator();
+    bi.setText("");
+    test0Sentences(bi);
+  }
+
+  /**
+   * Checks an iterator whose text contains no sentence: {@code first()} and {@code last()}
+   * are both expected to be 0 and every other move is expected to return
+   * {@link BreakIterator#DONE}.
+   */
+  private void test0Sentences(BreakIterator bi) {
+    assertEquals(0, bi.current());
+    assertEquals(0, bi.first());
+    assertEquals(BreakIterator.DONE, bi.next());
+    assertEquals(0, bi.last());
+    assertEquals(BreakIterator.DONE, bi.previous());
+    assertEquals(BreakIterator.DONE, bi.following(0));
+    assertEquals(BreakIterator.DONE, bi.preceding(0));
+    assertEquals(0, bi.first());
+    assertEquals(BreakIterator.DONE, bi.next(13));
+    assertEquals(BreakIterator.DONE, bi.next(-8));
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPTokenizerFactory.java b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPTokenizerFactory.java
new file mode 100644
index 0000000..db2bbb2
--- /dev/null
+++ b/lucene/analysis/opennlp/src/test/org/apache/lucene/analysis/opennlp/TestOpenNLPTokenizerFactory.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.opennlp;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+import org.junit.Test;
+
+/**
+ * Tests {@link OpenNLPTokenizerFactory} and, through it, the tokenizer it creates.
+ * The sentence and tokenizer model files are resolved by a {@link ClasspathResourceLoader}
+ * constructed from this test class.
+ */
+public class TestOpenNLPTokenizerFactory extends BaseTokenStreamTestCase {
+
+  private static final String sentenceModelFile = "en-test-sent.bin";
+  private static final String tokenizerModelFile = "en-test-tokenizer.bin";
+
+  private static final String SENTENCES = "Sentence number 1 has 6 words. Sentence number 2, 5 words.";
+  /** Expected terms for {@link #SENTENCES}; punctuation marks are separate tokens. */
+  private static final String[] SENTENCES_punc = {"Sentence", "number", "1", "has", "6", "words", ".", "Sentence", "number", "2", ",", "5", "words", "."};
+  private static final int[] SENTENCES_startOffsets = {0, 9, 16, 18, 22, 24, 29, 31, 40, 47, 48, 50, 52, 57};
+  private static final int[] SENTENCES_endOffsets = {8, 15, 17, 21, 23, 29, 30, 39, 46, 48, 49, 51, 57, 58};
+
+  private static final String SENTENCE1 = "Sentence number 1 has 6 words.";
+  private static final String[] SENTENCE1_punc = {"Sentence", "number", "1", "has", "6", "words", "."};
+
+  /** Both models configured: checks terms and offsets for two sentences, then terms for one. */
+  @Test
+  public void testTokenizer() throws IOException {
+    // The analyzer is Closeable; try-with-resources releases it even if an assertion fails.
+    try (CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+        .withTokenizer("opennlp", "sentenceModel", sentenceModelFile, "tokenizerModel", tokenizerModelFile)
+        .build()) {
+      assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
+      assertAnalyzesTo(analyzer, SENTENCE1, SENTENCE1_punc);
+    }
+  }
+
+  /** Omitting the {@code sentenceModel} argument must be rejected with a descriptive message. */
+  @Test
+  public void testTokenizerNoSentenceDetector() throws IOException {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+          .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile)
+          .build();
+    });
+    assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'sentenceModel'"));
+  }
+
+  /** Omitting the {@code tokenizerModel} argument must be rejected with a descriptive message. */
+  @Test
+  public void testTokenizerNoTokenizer() throws IOException {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
+          .withTokenizer("opennlp", "sentenceModel", sentenceModelFile)
+          .build();
+    });
+    assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'tokenizerModel'"));
+  }
+
+  /**
+   * Drives one tokenizer instance through repeated reset/close/setReader cycles and checks
+   * that it still produces the expected tokens afterwards (test analyzer caching the tokenizer).
+   */
+  @Test
+  public void testClose() throws IOException {
+    // Plain mutable map rather than double-brace initialization, which would create an
+    // anonymous HashMap subclass.
+    Map<String,String> args = new HashMap<>();
+    args.put("sentenceModel", sentenceModelFile);
+    args.put("tokenizerModel", tokenizerModelFile);
+    OpenNLPTokenizerFactory factory = new OpenNLPTokenizerFactory(args);
+    factory.inform(new ClasspathResourceLoader(getClass()));
+
+    Tokenizer ts = factory.create(newAttributeFactory());
+    ts.setReader(new StringReader(SENTENCES));
+
+    // The exact call order below is the scenario under test; do not reorder.
+    ts.reset();
+    ts.close();
+    ts.reset();
+    ts.setReader(new StringReader(SENTENCES));
+    assertTokenStreamContents(ts, SENTENCES_punc);
+    ts.close();
+    ts.reset();
+    ts.setReader(new StringReader(SENTENCES));
+    assertTokenStreamContents(ts, SENTENCES_punc);
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b720e1ee/lucene/analysis/opennlp/src/tools/test-model-data/README.txt
----------------------------------------------------------------------
diff --git a/lucene/analysis/opennlp/src/tools/test-model-data/README.txt b/lucene/analysis/opennlp/src/tools/test-model-data/README.txt
new file mode 100644
index 0000000..3ac0aa3
--- /dev/null
+++ b/lucene/analysis/opennlp/src/tools/test-model-data/README.txt
@@ -0,0 +1,6 @@
+Use small training data to create small models for unit tests.
+Training data derived from the Reuters corpus in a very unscientific way.
+Tagging done with CCG Urbana-Champaign online demos:
+	http://cogcomp.cs.illinois.edu/page/demos
+
+Run 'ant train-test-models' to generate models from training data here.