You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by sm...@apache.org on 2017/04/17 01:45:47 UTC
opennlp git commit: OPENNLP-1022:Fix documentation to remove
references to 'Save XXXModel to database, this closes apache/opennlp#158
Repository: opennlp
Updated Branches:
refs/heads/master a59765cd4 -> f8fbfc9fd
OPENNLP-1022:Fix documentation to remove references to 'Save XXXModel to database, this closes apache/opennlp#158
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f8fbfc9f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f8fbfc9f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f8fbfc9f
Branch: refs/heads/master
Commit: f8fbfc9fdca4b5e9ba1a5608ca17e7b6feb18c3c
Parents: a59765c
Author: smarthi <sm...@apache.org>
Authored: Sun Apr 16 21:45:17 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Sun Apr 16 21:45:17 2017 -0400
----------------------------------------------------------------------
opennlp-docs/src/docbkx/chunker.xml | 29 ++-------
opennlp-docs/src/docbkx/doccat.xml | 44 ++------------
opennlp-docs/src/docbkx/introduction.xml | 17 +-----
opennlp-docs/src/docbkx/lemmatizer.xml | 38 +++---------
opennlp-docs/src/docbkx/namefinder.xml | 36 +++---------
opennlp-docs/src/docbkx/parser.xml | 2 +-
opennlp-docs/src/docbkx/postagger.xml | 62 ++------------------
opennlp-docs/src/docbkx/sentdetect.xml | 33 ++---------
opennlp-docs/src/docbkx/tokenizer.xml | 15 +----
.../main/java/opennlp/tools/ml/BeamSearch.java | 23 +++-----
10 files changed, 46 insertions(+), 253 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/chunker.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/chunker.xml b/opennlp-docs/src/docbkx/chunker.xml
index 0c04e8a..b67a7fd 100644
--- a/opennlp-docs/src/docbkx/chunker.xml
+++ b/opennlp-docs/src/docbkx/chunker.xml
@@ -81,19 +81,8 @@ Rockwell_NNP said_VBD the_DT agreement_NN calls_VBZ for_IN it_PRP to_TO supply_V
InputStream modelIn = null;
ChunkerModel model = null;
-try {
- modelIn = new FileInputStream("en-chunker.bin");
+try (InputStream modelIn = new FileInputStream("en-chunker.bin")) {
model = new ChunkerModel(modelIn);
-} catch (IOException e) {
- // Model loading failed, handle the error
- e.printStackTrace();
-} finally {
- if (modelIn != null) {
- try {
- modelIn.close();
- } catch (IOException e) {
- }
- }
}]]>
</programlisting>
After the model is loaded a Chunker can be instantiated.
@@ -242,28 +231,18 @@ $ opennlp ChunkerTrainerME -model en-chunker.bin -lang en -data en-chunker.train
illustrates how to do it:
<programlisting language="java">
<![CDATA[
-Charset charset = Charset.forName("UTF-8");
ObjectStream<String> lineStream =
- new PlainTextByLineStream(new FileInputStream("en-chunker.train"),charset);
-ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(lineStream);
+ new PlainTextByLineStream(new FileInputStream("en-chunker.train"), StandardCharsets.UTF_8);
ChunkerModel model;
-try {
+try(ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(lineStream)) {
model = ChunkerME.train("en", sampleStream,
new DefaultChunkerContextGenerator(), TrainingParameters.defaultParams());
}
-finally {
- sampleStream.close();
-}
-OutputStream modelOut = null;
-try {
- modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
model.serialize(modelOut);
-} finally {
- if (modelOut != null)
- modelOut.close();
}]]>
</programlisting>
</para>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/doccat.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/doccat.xml b/opennlp-docs/src/docbkx/doccat.xml
index 7fe3f1f..c056732 100644
--- a/opennlp-docs/src/docbkx/doccat.xml
+++ b/opennlp-docs/src/docbkx/doccat.xml
@@ -127,33 +127,16 @@ $ opennlp DoccatTrainer -model en-doccat.bin -lang en -data en-doccat.train -enc
<programlisting language="java">
<![CDATA[
DoccatModel model = null;
-
InputStream dataIn = null;
-try {
- dataIn = new FileInputStream("en-sentiment.train");
+
+try (InputStream dataIn = new FileInputStream("en-sentiment.train")) {
ObjectStream<String> lineStream =
new PlainTextByLineStream(dataIn, "UTF-8");
ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
model = DocumentCategorizerME.train("en", sampleStream);
}
-catch (IOException e) {
- // Failed to read or parse training data, training failed
- e.printStackTrace();
-}
-finally {
- if (dataIn != null) {
- try {
- dataIn.close();
- }
- catch (IOException e) {
- // Not an issue, training already finished.
- // The exception should be logged and investigated
- // if part of a production system.
- e.printStackTrace();
- }
- }
-}]]>
+]]>
</programlisting>
Now might be a good time to cruise over to Hulu or something, because this could take a while if you've got a large training set.
You may see a lot of output as well. Once you're done, you can pretty quickly step to classification directly,
@@ -162,27 +145,10 @@ finally {
<para>
<programlisting language="java">
<![CDATA[
-OutputStream modelOut = null;
-try {
- modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
model.serialize(modelOut);
}
-catch (IOException e) {
- // Failed to save model
- e.printStackTrace();
-}
-finally {
- if (modelOut != null) {
- try {
- modelOut.close();
- }
- catch (IOException e) {
- // Failed to correctly save model.
- // Written model might be invalid.
- e.printStackTrace();
- }
- }
-}]]>
+]]>
</programlisting>
</para>
</section>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/introduction.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/introduction.xml b/opennlp-docs/src/docbkx/introduction.xml
index a3bd482..65fcd9d 100644
--- a/opennlp-docs/src/docbkx/introduction.xml
+++ b/opennlp-docs/src/docbkx/introduction.xml
@@ -65,23 +65,10 @@ under the License.
constructor of the model class:
<programlisting language="java">
<![CDATA[
-InputStream modelIn = new FileInputStream("lang-model-name.bin");
-
-try {
+try (InputStream modelIn = new FileInputStream("lang-model-name.bin")) {
SomeModel model = new SomeModel(modelIn);
}
-catch (IOException e) {
- //handle the exception
-}
-finally {
- if (null != modelIn) {
- try {
- modelIn.close();
- }
- catch (IOException e) {
- }
- }
-}]]>
+]]>
</programlisting>
</para>
<para>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/lemmatizer.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/lemmatizer.xml b/opennlp-docs/src/docbkx/lemmatizer.xml
index 34668d0..1fa5540 100644
--- a/opennlp-docs/src/docbkx/lemmatizer.xml
+++ b/opennlp-docs/src/docbkx/lemmatizer.xml
@@ -88,22 +88,11 @@ signed VBD sign
In the example below it is loaded from disk:
<programlisting language="java">
<![CDATA[
-InputStream modelIn = null;
LemmatizerModel model = null;
-try {
- modelIn = new FileInputStream("en-lemmatizer.bin");
+try (InputStream modelIn = new FileInputStream("en-lemmatizer.bin")) {
model = new LemmatizerModel(modelIn);
-} catch (IOException e) {
- // Model loading failed, handle the error
- e.printStackTrace();
-} finally {
- if (modelIn != null) {
- try {
- modelIn.close();
- } catch (IOException e) {
- }
- }
-}]]>
+}
+]]>
</programlisting>
After the model is loaded a LemmatizerME can be instantiated.
<programlisting language="java">
@@ -174,22 +163,10 @@ shrapnel NN shrapnel
<![CDATA[
InputStream dictLemmatizer = null;
-try {
- dictLemmatizer = new FileInputStream("english-lemmatizer.txt");
-}
-catch (IOException e) {
- // dictionary loading failed, handle the error
- e.printStackTrace();
+try (InputStream dictLemmatizer = new FileInputStream("english-lemmatizer.txt")) {
+
}
-finally {
- if (dictLemmatizer != null) {
- try {
- dictLemmatizer.close();
- }
- catch (IOException e) {
- }
- }
-}]]>
+]]>
</programlisting>
After the dictionary is loaded the DictionaryLemmatizer can be
instantiated.
@@ -303,8 +280,7 @@ $ opennlp LemmatizerTrainerME -model en-lemmatizer.bin -params PerceptronTrainer
TrainingParameters mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
if (mlParams == null) {
mlParams = ModelUtil.createDefaultTrainingParameters();
- }
-]]>
+ }]]>
</programlisting>
Then we read the training data:
<programlisting language="java">
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/namefinder.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/namefinder.xml b/opennlp-docs/src/docbkx/namefinder.xml
index 1e72a82..2f68c47 100644
--- a/opennlp-docs/src/docbkx/namefinder.xml
+++ b/opennlp-docs/src/docbkx/namefinder.xml
@@ -80,23 +80,10 @@ Mr . <START:person> Vinken <END> is chairman of Elsevier N.V. , the Dutch publis
In the sample below it is loaded from disk.
<programlisting language="java">
<![CDATA[
-InputStream modelIn = new FileInputStream("en-ner-person.bin");
-
-try {
+try (InputStream modelIn = new FileInputStream("en-ner-person.bin")){
TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
}
-catch (IOException e) {
- e.printStackTrace();
-}
-finally {
- if (modelIn != null) {
- try {
- modelIn.close();
- }
- catch (IOException e) {
- }
- }
-}]]>
+]]>
</programlisting>
There is a number of reasons why the model loading can fail:
<itemizedlist>
@@ -274,33 +261,24 @@ $ opennlp TokenNameFinderTrainer -featuregen brown.xml -sequenceCodec BILOU -res
<para>Call the NameFinderME.train method</para>
</listitem>
<listitem>
- <para>Save the TokenNameFinderModel to a file or database</para>
+ <para>Save the TokenNameFinderModel to a file</para>
</listitem>
</itemizedlist>
The three steps are illustrated by the following sample code:
<programlisting language="java">
<![CDATA[
-Charset charset = Charset.forName("UTF-8");
ObjectStream<String> lineStream =
- new PlainTextByLineStream(new FileInputStream("en-ner-person.train"), charset);
-ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
+ new PlainTextByLineStream(new FileInputStream("en-ner-person.train"), StandardCharsets.UTF_8);
TokenNameFinderModel model;
-try {
+try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream)) {
model = NameFinderME.train("en", "person", sampleStream, TrainingParameters.defaultParams(),
TokenNameFinderFactory nameFinderFactory);
}
-finally {
- sampleStream.close();
-}
-try {
- modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
model.serialize(modelOut);
-} finally {
- if (modelOut != null)
- modelOut.close();
}]]>
</programlisting>
</para>
@@ -542,7 +520,7 @@ System.out.println(result.toString());]]>
<programlisting language="java">
<![CDATA[
FileInputStream sampleDataIn = new FileInputStream("en-ner-person.train");
-ObjectStream<NameSample> sampleStream = new PlainTextByLineStream(sampleDataIn.getChannel(), "UTF-8");
+ObjectStream<NameSample> sampleStream = new PlainTextByLineStream(sampleDataIn.getChannel(), StandardCharsets.UTF_8);
TokenNameFinderCrossValidator evaluator = new TokenNameFinderCrossValidator("en", 100, 5);
evaluator.evaluate(sampleStream, 10);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/parser.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/parser.xml b/opennlp-docs/src/docbkx/parser.xml
index a81c078..614293b 100644
--- a/opennlp-docs/src/docbkx/parser.xml
+++ b/opennlp-docs/src/docbkx/parser.xml
@@ -218,7 +218,7 @@ $ opennlp TaggerModelReplacer en-parser-chunking.bin en-pos-maxent.bin]]>
<para>Call a Parser train method: This can be either the CHUNKING or the TREEINSERT parser.</para>
</listitem>
<listitem>
- <para>Save the ParseModel to a file or database.</para>
+ <para>Save the ParseModel to a file</para>
</listitem>
</itemizedlist>
The following code snippet shows how to instantiate the HeadRules:
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/postagger.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/postagger.xml b/opennlp-docs/src/docbkx/postagger.xml
index e981c3a..b623d2e 100644
--- a/opennlp-docs/src/docbkx/postagger.xml
+++ b/opennlp-docs/src/docbkx/postagger.xml
@@ -69,24 +69,8 @@ Mr._NNP Vinken_NNP is_VBZ chairman_NN of_IN Elsevier_NNP N.V._NNP ,_, the_DT Dut
In the sample below its loaded from disk.
<programlisting language="java">
<![CDATA[
-InputStream modelIn = null;
-
-try {
- modelIn = new FileInputStream("en-pos-maxent.bin");
+try (InputStream modelIn = new FileInputStream("en-pos-maxent.bin")) {
POSModel model = new POSModel(modelIn);
-}
-catch (IOException e) {
- // Model loading failed, handle the error
- e.printStackTrace();
-}
-finally {
- if (modelIn != null) {
- try {
- modelIn.close();
- }
- catch (IOException e) {
- }
- }
}]]>
</programlisting>
After the model is loaded the POSTaggerME can be instantiated.
@@ -214,7 +198,7 @@ $ opennlp POSTaggerTrainer -type maxent -model en-pos-maxent.bin \
<para>Call the POSTagger.train method</para>
</listitem>
<listitem>
- <para>Save the POSModel to a file or database</para>
+ <para>Save the POSModel to a file</para>
</listitem>
</itemizedlist>
The following code illustrates that:
@@ -222,30 +206,11 @@ $ opennlp POSTaggerTrainer -type maxent -model en-pos-maxent.bin \
<![CDATA[
POSModel model = null;
-InputStream dataIn = null;
-try {
- dataIn = new FileInputStream("en-pos.train");
- ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
+try (InputStream dataIn = new FileInputStream("en-pos.train")){
+ ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, StandardCharsets.UTF_8);
ObjectStream<POSSample> sampleStream = new WordTagSampleStream(lineStream);
model = POSTaggerME.train("en", sampleStream, TrainingParameters.defaultParams(), null, null);
-}
-catch (IOException e) {
- // Failed to read or parse training data, training failed
- e.printStackTrace();
-}
-finally {
- if (dataIn != null) {
- try {
- dataIn.close();
- }
- catch (IOException e) {
- // Not an issue, training already finished.
- // The exception should be logged and investigated
- // if part of a production system.
- e.printStackTrace();
- }
- }
}]]>
</programlisting>
The above code performs the first two steps, opening the data and training
@@ -253,25 +218,8 @@ finally {
the sample below it is written into a file.
<programlisting language="java">
<![CDATA[
-OutputStream modelOut = null;
-try {
- modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))){
model.serialize(modelOut);
-}
-catch (IOException e) {
- // Failed to save model
- e.printStackTrace();
-}
-finally {
- if (modelOut != null) {
- try {
- modelOut.close();
- }
- catch (IOException e) {
- // Failed to correctly save model.
- // Written model might be invalid.
- e.printStackTrace();
- }
}]]>
</programlisting>
</para>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/sentdetect.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/sentdetect.xml b/opennlp-docs/src/docbkx/sentdetect.xml
index 0c67b51..aacd4d3 100644
--- a/opennlp-docs/src/docbkx/sentdetect.xml
+++ b/opennlp-docs/src/docbkx/sentdetect.xml
@@ -81,22 +81,9 @@ $ opennlp SentenceDetector en-sent.bin < input.txt > output.txt]]>
To instantiate the Sentence Detector the sentence model must be loaded first.
<programlisting language="java">
<![CDATA[
-InputStream modelIn = new FileInputStream("en-sent.bin");
-try {
+try (InputStream modelIn = new FileInputStream("en-sent.bin")) {
SentenceModel model = new SentenceModel(modelIn);
-}
-catch (IOException e) {
- e.printStackTrace();
-}
-finally {
- if (modelIn != null) {
- try {
- modelIn.close();
- }
- catch (IOException e) {
- }
- }
}]]>
</programlisting>
After the model is loaded the SentenceDetectorME can be instantiated.
@@ -123,7 +110,7 @@ Span sentences[] = sentenceDetector.sentPosDetect(" First sentence. Second sent
</section>
<section id="tools.sentdetect.training">
<title>Sentence Detector Training</title>
- <para></para>
+ <para/>
<section id="tools.sentdetect.training.tool">
<title>Training Tool</title>
<para>
@@ -220,27 +207,17 @@ Path: en-sent.bin
The following sample code illustrates these steps:
<programlisting language="java">
<![CDATA[
-Charset charset = Charset.forName("UTF-8");
ObjectStream<String> lineStream =
- new PlainTextByLineStream(new FileInputStream("en-sent.train"), charset);
-ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream);
+ new PlainTextByLineStream(new FileInputStream("en-sent.train"), StandardCharsets.UTF_8);
SentenceModel model;
-try {
+try (ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream)) {
model = SentenceDetectorME.train("en", sampleStream, true, null, TrainingParameters.defaultParams());
}
-finally {
- sampleStream.close();
-}
-OutputStream modelOut = null;
-try {
- modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
model.serialize(modelOut);
-} finally {
- if (modelOut != null)
- modelOut.close();
}]]>
</programlisting>
</para>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-docs/src/docbkx/tokenizer.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/tokenizer.xml b/opennlp-docs/src/docbkx/tokenizer.xml
index d8df477..6d54c3c 100644
--- a/opennlp-docs/src/docbkx/tokenizer.xml
+++ b/opennlp-docs/src/docbkx/tokenizer.xml
@@ -154,22 +154,9 @@ London share prices were bolstered largely by continued gains on Wall Street and
can be loaded.
<programlisting language="java">
<![CDATA[
-InputStream modelIn = new FileInputStream("en-token.bin");
-try {
+try (InputStream modelIn = new FileInputStream("en-token.bin")) {
TokenizerModel model = new TokenizerModel(modelIn);
-}
-catch (IOException e) {
- e.printStackTrace();
-}
-finally {
- if (modelIn != null) {
- try {
- modelIn.close();
- }
- catch (IOException e) {
- }
- }
}]]>
</programlisting>
After the model is loaded the TokenizerME can be instantiated.
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f8fbfc9f/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java b/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java
index 949a408..7987b9f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/BeamSearch.java
@@ -105,13 +105,8 @@ public class BeamSearch<T> implements SequenceClassificationModel<T> {
String[] contexts = cg.getContext(i, sequence, outcomes, additionalContext);
double[] scores;
if (contextsCache != null) {
- scores = contextsCache.get(contexts);
- if (scores == null) {
- scores = model.eval(contexts, probs);
- contextsCache.put(contexts,scores);
- }
- }
- else {
+ scores = contextsCache.computeIfAbsent(contexts, c -> model.eval(c, probs));
+ } else {
scores = model.eval(contexts, probs);
}
@@ -123,13 +118,13 @@ public class BeamSearch<T> implements SequenceClassificationModel<T> {
double min = temp_scores[Math.max(0,scores.length - size)];
for (int p = 0; p < scores.length; p++) {
- if (scores[p] < min)
- continue; //only advance first "size" outcomes
- String out = model.getOutcome(p);
- if (validator.validSequence(i, sequence, outcomes, out)) {
- Sequence ns = new Sequence(top, out, scores[p]);
- if (ns.getScore() > minSequenceScore) {
- next.add(ns);
+ if (scores[p] >= min) {
+ String out = model.getOutcome(p);
+ if (validator.validSequence(i, sequence, outcomes, out)) {
+ Sequence ns = new Sequence(top, out, scores[p]);
+ if (ns.getScore() > minSequenceScore) {
+ next.add(ns);
+ }
}
}
}