You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2016/10/17 22:59:24 UTC
opennlp-sandbox git commit: OPENNLP-864 Rename name finder annotator classes
Repository: opennlp-sandbox
Updated Branches:
refs/heads/864 [created] 552afeab1
OPENNLP-864 Rename name finder annotator classes
Project: http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/commit/552afeab
Tree: http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/tree/552afeab
Diff: http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/diff/552afeab
Branch: refs/heads/864
Commit: 552afeab109ba9a119da6665cb5568250f5e46ee
Parents: bf255a3
Author: kottmann <jo...@apache.org>
Authored: Tue Oct 18 00:58:53 2016 +0200
Committer: kottmann <jo...@apache.org>
Committed: Tue Oct 18 00:58:53 2016 +0200
----------------------------------------------------------------------
.../opennlp/bratann/NameFinderAnnService.java | 112 ++++++++++++++
.../opennlp/bratann/NameFinderResource.java | 148 +++++++++++++++++++
.../opennlp/bratannotator/BratAnnService.java | 112 --------------
.../bratannotator/BratNameFinderResource.java | 148 -------------------
4 files changed, 260 insertions(+), 260 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/552afeab/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java
new file mode 100644
index 0000000..926730f
--- /dev/null
+++ b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratann;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
+
+import org.eclipse.jetty.server.Server;
+import org.eclipse.jetty.servlet.ServletContextHandler;
+import org.eclipse.jetty.servlet.ServletHolder;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+
+public class NameFinderAnnService {
+ // Components shared with NameFinderResource; defaults apply unless main replaces them.
+ public static SentenceDetector sentenceDetector = new NewlineSentenceDetector();;
+ public static Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+ public static TokenNameFinder nameFinders[];
+ // Parses the command line, loads the models and starts a Jetty server on port 8080.
+ public static void main(String[] args) throws Exception {
+
+ if (args.length == 0) {
+ System.out.println(
+ "[-tokenizerModel file] [-ruleBasedTokenizer whitespace|simple] [-sentenceDetectorModel file] "
+ + "namefinderFile|nameFinderURI");
+ return;
+ }
+
+ List<String> argList = Arrays.asList(args);
+ // indexOf(...) + 1 is the position of an option's value; 0 means the option is absent
+ int sentenceModelIndex = argList.indexOf("-sentenceDetectorModel")
+ + 1;
+ if (sentenceModelIndex > 0 && sentenceModelIndex < args.length) {
+ sentenceDetector = new SentenceDetectorME(
+ new SentenceModel(new File(args[sentenceModelIndex])));
+ }
+
+ int ruleBasedTokenizerIndex = argList.indexOf("-ruleBasedTokenizer") + 1;
+
+ if (ruleBasedTokenizerIndex > 0 && ruleBasedTokenizerIndex < args.length) {
+ if ("whitespace".equals(args[ruleBasedTokenizerIndex])) {
+ tokenizer = WhitespaceTokenizer.INSTANCE;
+ } else if ("simple".equals(args[ruleBasedTokenizerIndex])) {
+ tokenizer = SimpleTokenizer.INSTANCE;
+ } else {
+ System.out
+ .println("unkown tokenizer: " + args[ruleBasedTokenizerIndex]);
+ return;
+ }
+ }
+ // checked after -ruleBasedTokenizer, so a tokenizer model replaces a rule based choice
+ int tokenizerModelIndex = argList.indexOf("-tokenizerModel") + 1;
+ if (tokenizerModelIndex > 0 && tokenizerModelIndex < args.length) {
+ tokenizer = new TokenizerME(
+ new TokenizerModel(new File(args[tokenizerModelIndex])));
+ }
+ // the last argument is always read as the name finder model file
+ nameFinders = new TokenNameFinder[] { new NameFinderME(
+ new TokenNameFinderModel(new File(args[args.length - 1]))) };
+
+ ServletContextHandler context = new ServletContextHandler(
+ ServletContextHandler.SESSIONS);
+ context.setContextPath("/");
+
+ Server jettyServer = new Server(8080);
+ jettyServer.setHandler(context);
+
+ ServletHolder jerseyServlet = context
+ .addServlet(com.sun.jersey.spi.container.servlet.ServletContainer.class, "/*");
+ jerseyServlet.setInitParameter("com.sun.jersey.config.property.packages",
+ "opennlp.bratann");
+ jerseyServlet.setInitParameter("com.sun.jersey.api.json.POJOMappingFeature", "true");
+ jerseyServlet.setInitOrder(0);
+
+ jerseyServlet.setInitParameter("jersey.config.server.provider.classnames",
+ NameFinderResource.class.getCanonicalName());
+
+ try {
+ jettyServer.start();
+ jettyServer.join();
+ } finally {
+ jettyServer.destroy();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/552afeab/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java
new file mode 100644
index 0000000..39cec0e
--- /dev/null
+++ b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratann;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.QueryParam;
+import javax.ws.rs.core.MediaType;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+@Path("/ner")
+public class NameFinderResource {
+ // JSON shape of one detected name; a name spanning line breaks has several segments
+ public static class NameAnn {
+ public int[][] offsets; // {begin, end} character offsets into the request text, per segment
+ public String[] texts; // text covered by each segment
+ public String type; // type reported by the name finder span
+ }
+
+ private SentenceDetector sentDetect = NameFinderAnnService.sentenceDetector;
+ private Tokenizer tokenizer = NameFinderAnnService.tokenizer;
+ private TokenNameFinder nameFinders[] = NameFinderAnnService.nameFinders;
+ // Returns the index of the first non-whitespace char in [beginOffset, endOffset), or -1.
+ private static int findNextNonWhitespaceChar(CharSequence s, int beginOffset,
+ int endOffset) {
+ // isSpaceChar alone misses '\n', '\r' and '\t'; isWhitespace alone misses no-break spaces
+ for (int i = beginOffset; i < endOffset; i++) {
+ if (!Character.isSpaceChar(s.charAt(i)) && !Character.isWhitespace(s.charAt(i))) {
+ return i;
+ }
+ }
+
+ return -1;
+ }
+
+ @POST
+ @Consumes(MediaType.TEXT_PLAIN)
+ @Produces(MediaType.APPLICATION_JSON)
+ public Map<String, NameAnn> findNames(@QueryParam("model") String modelName,
+ String text) {
+ // Runs every name finder on each sentence; modelName is not used by this method.
+ Span sentenceSpans[] = sentDetect.sentPosDetect(text);
+
+ Map<String, NameAnn> map = new HashMap<String, NameAnn>();
+
+ int indexCounter = 0;
+
+ for (int i = 0; i < sentenceSpans.length; i++) {
+
+ String sentenceText = sentenceSpans[i].getCoveredText(text).toString();
+
+ // token spans are relative to the sentence; its start is added back below
+ Span tokenSpans[] = tokenizer
+ .tokenizePos(sentenceText);
+
+ String tokens[] = Span.spansToStrings(tokenSpans, sentenceText);
+
+ for (TokenNameFinder nameFinder : nameFinders) {
+ Span names[] = nameFinder.find(tokens);
+
+ for (Span name : names) {
+ // convert token indexes to character offsets in the full text
+ int beginOffset = tokenSpans[name.getStart()].getStart()
+ + sentenceSpans[i].getStart();
+ int endOffset = tokenSpans[name.getEnd() - 1].getEnd()
+ + sentenceSpans[i].getStart();
+
+ // create a list of new line indexes
+ List<Integer> newLineIndexes = new ArrayList<Integer>();
+
+ // a run of consecutive line-break chars (e.g. "\r\n") is recorded only
+ // once, at the index of its first char
+
+ boolean inNewLineSequence = false;
+ for (int ci = beginOffset; ci < endOffset; ci++) {
+ if (text.charAt(ci) == '\n' || text.charAt(ci) == '\r') {
+ if (!inNewLineSequence) {
+ newLineIndexes.add(ci);
+ }
+ inNewLineSequence = true;
+ } else {
+ inNewLineSequence = false;
+ }
+ }
+
+ List<String> textSegments = new ArrayList<String>();
+ List<int[]> spanSegments = new ArrayList<int[]>();
+
+ int segmentBegin = beginOffset;
+
+ for (int newLineOffset : newLineIndexes) {
+ if (newLineOffset < segmentBegin) {
+ continue; // this break was already skipped as whitespace before the segment
+ }
+ // create segment from begin to offset
+ textSegments.add(text.substring(segmentBegin, newLineOffset));
+ spanSegments.add(new int[] { segmentBegin, newLineOffset });
+ segmentBegin = findNextNonWhitespaceChar(text, newLineOffset + 1, endOffset);
+ if (segmentBegin == -1) {
+ break;
+ }
+ }
+
+ // create left over segment
+ if (segmentBegin != -1) {
+ textSegments.add(text.substring(segmentBegin, endOffset));
+ spanSegments.add(new int[] { segmentBegin, endOffset });
+ }
+
+ NameAnn ann = new NameAnn();
+ ann.texts = textSegments.toArray(new String[textSegments.size()]);
+ ann.offsets = spanSegments.toArray(new int[spanSegments.size()][]);
+ ann.type = name.getType();
+
+ map.put(Integer.toString(indexCounter++), ann);
+ }
+ }
+ }
+
+ return map;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/552afeab/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java b/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
deleted file mode 100644
index 6bfbb5c..0000000
--- a/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratAnnService.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.bratannotator;
-
-import java.io.File;
-import java.net.URI;
-import java.net.URL;
-
-import org.eclipse.jetty.server.Server;
-import org.eclipse.jetty.servlet.ServletContextHandler;
-import org.eclipse.jetty.servlet.ServletHolder;
-
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.namefind.TokenNameFinder;
-import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.sentdetect.NewlineSentenceDetector;
-import opennlp.tools.sentdetect.SentenceDetector;
-import opennlp.tools.sentdetect.SentenceDetectorME;
-import opennlp.tools.sentdetect.SentenceModel;
-import opennlp.tools.tokenize.SimpleTokenizer;
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.tokenize.TokenizerME;
-import opennlp.tools.tokenize.TokenizerModel;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
-
-public class BratAnnService {
-
- public static SentenceDetector sentenceDetector;
- public static Tokenizer tokenizer;
- public static TokenNameFinder nameFinders[];
-
- public static void main(String[] args) throws Exception {
-
- if (args.length < 3) {
- System.out.println("sentenceDetectorURI tokenizerURI namefinderURI_1 ... nameFinderURI_n");
- return;
- }
-
- URI sentenceDetectorUri = URI.create(args[0]);
- if ("sentenceDetector".equals(sentenceDetectorUri.getScheme())) {
-
- if ("newline".equals(sentenceDetectorUri.getSchemeSpecificPart())) {
- sentenceDetector = new NewlineSentenceDetector();
- }
- else {
- System.out.println("unkown sentence detector");
- return;
- }
- }
- else {
- sentenceDetector = new SentenceDetectorME(new SentenceModel(new File(args[0])));
- }
-
- URI tokenizerUri = URI.create(args[1]);
- if ("tokenizer".equals(tokenizerUri.getScheme())) {
- if ("whitespace".equals(tokenizerUri.getSchemeSpecificPart())) {
- tokenizer = WhitespaceTokenizer.INSTANCE;
- }
- else if ("simple".equals(tokenizerUri.getSchemeSpecificPart())) {
- tokenizer = SimpleTokenizer.INSTANCE;
- }
- else {
- System.out.println("unkown tokenizer");
- return;
- }
-
- }
- else {
- tokenizer = new TokenizerME(new TokenizerModel(new File(args[1])));
- }
-
- nameFinders = new TokenNameFinder[] {new NameFinderME(new TokenNameFinderModel(new URL(args[2])))};
-
- ServletContextHandler context = new ServletContextHandler(
- ServletContextHandler.SESSIONS);
- context.setContextPath("/");
-
- Server jettyServer = new Server(8080);
- jettyServer.setHandler(context);
-
- ServletHolder jerseyServlet = context
- .addServlet(com.sun.jersey.spi.container.servlet.ServletContainer.class, "/*");
- jerseyServlet.setInitParameter("com.sun.jersey.config.property.packages", "opennlp.bratannotator");
- jerseyServlet.setInitParameter("com.sun.jersey.api.json.POJOMappingFeature", "true");
- jerseyServlet.setInitOrder(0);
-
- jerseyServlet.setInitParameter("jersey.config.server.provider.classnames",
- BratNameFinderResource.class.getCanonicalName());
-
- try {
- jettyServer.start();
- jettyServer.join();
- } finally {
- jettyServer.destroy();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/552afeab/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java b/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
deleted file mode 100644
index 88dacc6..0000000
--- a/opennlp-brat-annotator/src/main/java/opennlp/bratannotator/BratNameFinderResource.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.bratannotator;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import javax.ws.rs.Consumes;
-import javax.ws.rs.POST;
-import javax.ws.rs.Path;
-import javax.ws.rs.Produces;
-import javax.ws.rs.QueryParam;
-import javax.ws.rs.core.MediaType;
-
-import opennlp.tools.namefind.TokenNameFinder;
-import opennlp.tools.sentdetect.SentenceDetector;
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.util.Span;
-
-@Path("/ner")
-public class BratNameFinderResource {
-
- public static class NameAnn {
- public int[][] offsets;
- public String[] texts;
- public String type;
- }
-
- private SentenceDetector sentDetect = BratAnnService.sentenceDetector;
- private Tokenizer tokenizer = BratAnnService.tokenizer;
- private TokenNameFinder nameFinders[] = BratAnnService.nameFinders;
-
- private static int findNextNonWhitespaceChar(CharSequence s, int beginOffset,
- int endOffset) {
-
- for (int i = beginOffset; i < endOffset; i++) {
- if (!Character.isSpaceChar(s.charAt(i))) {
- return i;
- }
- }
-
- return -1;
- }
-
- @POST
- @Consumes(MediaType.TEXT_PLAIN)
- @Produces(MediaType.APPLICATION_JSON)
- public Map<String, NameAnn> findNames(@QueryParam("model") String modelName,
- String text) {
-
- Span sentenceSpans[] = sentDetect.sentPosDetect(text);
-
- Map<String, NameAnn> map = new HashMap<String, NameAnn>();
-
- int indexCounter = 0;
-
- for (int i = 0; i < sentenceSpans.length; i++) {
-
- String sentenceText = sentenceSpans[i].getCoveredText(text).toString();
-
- // offset of sentence gets lost here!
- Span tokenSpans[] = tokenizer
- .tokenizePos(sentenceText);
-
- String tokens[] = Span.spansToStrings(tokenSpans, sentenceText);
-
- for (TokenNameFinder nameFinder : nameFinders) {
- Span names[] = nameFinder.find(tokens);
-
- for (Span name : names) {
-
- int beginOffset = tokenSpans[name.getStart()].getStart()
- + sentenceSpans[i].getStart();
- int endOffset = tokenSpans[name.getEnd() - 1].getEnd()
- + sentenceSpans[i].getStart();
-
- // create a list of new line indexes
- List<Integer> newLineIndexes = new ArrayList<Integer>();
-
- // TODO: Code needs to handle case that there are multiple new lines
- // in a row
-
- boolean inNewLineSequence = false;
- for (int ci = beginOffset; ci < endOffset; ci++) {
- if (text.charAt(ci) == '\n' || text.charAt(ci) == '\r') {
- if (!inNewLineSequence) {
- newLineIndexes.add(ci);
- }
- inNewLineSequence = true;
- } else {
- inNewLineSequence = false;
- }
- }
-
- List<String> textSegments = new ArrayList<String>();
- List<int[]> spanSegments = new ArrayList<int[]>();
-
- int segmentBegin = beginOffset;
-
- for (int newLineOffset : newLineIndexes) {
- // create segment from begin to offset
- textSegments.add(text.substring(segmentBegin, newLineOffset));
- spanSegments.add(new int[] { segmentBegin, newLineOffset });
-
- segmentBegin = findNextNonWhitespaceChar(text, newLineOffset + 1,
- endOffset);
-
- if (segmentBegin == -1) {
- break;
- }
- }
-
- // create left over segment
- if (segmentBegin != -1) {
- textSegments.add(text.substring(segmentBegin, endOffset));
- spanSegments.add(new int[] { segmentBegin, endOffset });
- }
-
- NameAnn ann = new NameAnn();
- ann.texts = textSegments.toArray(new String[textSegments.size()]);
- ann.offsets = spanSegments.toArray(new int[spanSegments.size()][]);
- ann.type = name.getType();
-
- map.put(Integer.toString(indexCounter++), ann);
- }
- }
- }
-
- return map;
- }
-}