You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/05/03 10:07:44 UTC

opennlp git commit: OPENNLP-1047: Add detokenizer and sent detect abbreviations for Irish

Repository: opennlp
Updated Branches:
  refs/heads/master 3df659b9b -> caeaaeea6


OPENNLP-1047: Add detokenizer and sent detect abbreviations for Irish

Closes #188


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/caeaaeea
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/caeaaeea
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/caeaaeea

Branch: refs/heads/master
Commit: caeaaeea61e88fe4222b997b2dad49728b91ba68
Parents: 3df659b
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sat Apr 29 00:06:42 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 3 12:05:16 2017 +0200

----------------------------------------------------------------------
 opennlp-tools/lang/ga/sentdetect/abb.xml        | 164 +++++++++++++++++++
 .../lang/ga/tokenizer/ga-detokenizer.xml        | 113 +++++++++++++
 2 files changed, 277 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/caeaaeea/opennlp-tools/lang/ga/sentdetect/abb.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/lang/ga/sentdetect/abb.xml b/opennlp-tools/lang/ga/sentdetect/abb.xml
new file mode 100644
index 0000000..9d15aed
--- /dev/null
+++ b/opennlp-tools/lang/ga/sentdetect/abb.xml
@@ -0,0 +1,164 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<dictionary case_sensitive="false"><!-- Abbreviation list for the Irish (ga) sentence detector, per the file path lang/ga/sentdetect/abb.xml. Each entry holds exactly one token and every token ends in a full stop. NOTE(review): case_sensitive="false" presumably makes lookups ignore letter case; confirm against the dictionary loader. -->
+<entry><!-- NOTE(review): the expansions given in the comments below are reviewer glosses, not part of the commit; confirm with an Irish speaker before relying on them. -->
+<token>tel.</token>
+</entry>
+<entry>
+<token>Mr.</token>
+</entry>
+<entry>
+<token>Mrs.</token>
+</entry>
+<entry>
+<token>.i.</token><!-- gloss: "eadhon" (that is / namely); note the leading full stop is part of the token -->
+</entry>
+<entry>
+<token>Uacht.</token><!-- gloss: "Uachtarán" (President) -->
+</entry>
+<entry>
+<token>m.sh.</token><!-- gloss: "mar shampla" (for example) -->
+</entry>
+<entry>
+<token>lch.</token><!-- gloss: "leathanach" (page) -->
+</entry>
+<entry>
+<token>lgh.</token><!-- gloss: "leathanaigh" (pages) -->
+</entry>
+<entry>
+<token>Dr.</token>
+</entry>
+<entry>
+<token>uimh.</token><!-- gloss: "uimhir" (number) -->
+</entry>
+<entry>
+<token>Co.</token><!-- gloss: "Contae" (County); also the English "Co." -->
+</entry>
+<entry>
+<token>gCo.</token><!-- gloss: eclipsed form of Co., as in "i gCo." -->
+</entry>
+<entry>
+<token>tUacht.</token><!-- gloss: t-prefixed form of Uacht., as in "an tUachtarán" -->
+</entry>
+<entry>
+<token>Uas.</token><!-- gloss: "Uasal" (Mr.) -->
+</entry>
+<entry>
+<token>tUas.</token><!-- gloss: t-prefixed form of Uas., as in "an tUasal" -->
+</entry>
+<entry>
+<token>Msc.</token>
+</entry>
+<entry>
+<token>Ms.</token>
+</entry>
+<entry>
+<token>Sr.</token>
+</entry>
+<entry>
+<token>Jr.</token>
+</entry>
+<entry>
+<token>Bros.</token>
+</entry>
+<entry>
+<token>fig.</token>
+</entry>
+<entry>
+<token>Jan.</token><!-- English month abbreviations run from here to Dec.; there is no entry for May, and September appears as both Sep. and Sept. -->
+</entry>
+<entry>
+<token>Feb.</token>
+</entry>
+<entry>
+<token>Mar.</token>
+</entry>
+<entry>
+<token>Apr.</token>
+</entry>
+<entry>
+<token>Jun.</token>
+</entry>
+<entry>
+<token>Jul.</token>
+</entry>
+<entry>
+<token>Aug.</token>
+</entry>
+<entry>
+<token>Sep.</token>
+</entry>
+<entry>
+<token>Sept.</token>
+</entry>
+<entry>
+<token>Oct.</token>
+</entry>
+<entry>
+<token>Nov.</token>
+</entry>
+<entry>
+<token>Dec.</token>
+</entry>
+<entry>
+<token>Ean.</token><!-- gloss: "Eanáir" (January); Irish month abbreviations appear to run from here to Nol. -->
+</entry>
+<entry>
+<token>Fea.</token><!-- gloss: "Feabhra" (February) -->
+</entry>
+<entry>
+<token>Már.</token><!-- gloss: "Márta" (March); distinct from the unaccented English Mar. above -->
+</entry>
+<entry>
+<token>Aib.</token><!-- gloss: "Aibreán" (April) -->
+</entry>
+<entry>
+<token>Bea.</token><!-- gloss: "Bealtaine" (May) -->
+</entry>
+<entry>
+<token>Mei.</token><!-- gloss: "Meitheamh" (June) -->
+</entry>
+<entry>
+<token>Iúl.</token><!-- gloss: "Iúil" (July) -->
+</entry>
+<entry>
+<token>Lún.</token><!-- gloss: "Lúnasa" (August) -->
+</entry>
+<entry>
+<token>M.Fr.</token><!-- gloss: "Meán Fómhair" (September) -->
+</entry>
+<entry>
+<token>D.Fr.</token><!-- gloss: "Deireadh Fómhair" (October) -->
+</entry>
+<entry>
+<token>Sam.</token><!-- gloss: "Samhain" (November) -->
+</entry>
+<entry>
+<token>Nol.</token><!-- gloss: "Nollaig" (December) -->
+</entry>
+<entry>
+<token>Ltd.</token>
+</entry>
+<entry>
+<token>Teo.</token><!-- gloss: "Teoranta" (Limited), the Irish counterpart of Ltd. -->
+</entry>
+</dictionary>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/caeaaeea/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml b/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
new file mode 100644
index 0000000..23fe96a
--- /dev/null
+++ b/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.    
+-->
+
+<dictionary><!-- Detokenizer rules for Irish (ga), per the file path lang/ga/tokenizer/ga-detokenizer.xml. Each entry pairs one token with an operation attribute. NOTE(review): the operation meanings noted below are the reviewer's reading of the OpenNLP detokenizer manual and are not defined in this file; confirm before relying on them. -->
+  <entry operation="RIGHT_LEFT_MATCHING"><!-- presumably: the first occurrence joins to the following token and the next occurrence joins to the preceding token, i.e. paired quotes -->
+    <token>"</token>
+  </entry>
+  <entry operation="RIGHT_LEFT_MATCHING">
+    <token>'</token>
+  </entry>
+  <entry operation="MOVE_LEFT"><!-- presumably: the token is joined to the token on its left, so no space is written before it -->
+    <token>.</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>?</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>!</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>,</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>;</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>:</token>
+  </entry>
+  <entry operation="MOVE_RIGHT"><!-- presumably: the token is joined to the token on its right, so no space is written after it -->
+    <token>(</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>)</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>}</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>{</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>]</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>[</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>»</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>«</token>
+  </entry>
+  <entry operation="MOVE_RIGHT"><!-- the two-character tokens `` and '' get separate one-directional rules (MOVE_RIGHT and MOVE_LEFT), unlike the single " and ' tokens at the top of the list -->
+    <token>``</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>''</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>%</token>
+  </entry>
+  <entry operation="MOVE_LEFT"><!-- .org, .com and .net are listed as whole tokens with MOVE_LEFT; presumably this re-attaches a domain suffix that was split off, TODO confirm -->
+    <token>.org</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.com</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.net</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>#</token>
+  </entry>
+  <entry operation="MOVE_BOTH"><!-- presumably: the token is joined to both of its neighbours -->
+    <token>-</token>
+  </entry>
+  <entry operation="MOVE_RIGHT"><!-- from here to the end every token ends in an apostrophe and uses MOVE_RIGHT; these look like Irish elided forms (for example m' for "mo", d' for "do"), a reviewer gloss to be confirmed -->
+    <token>m'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>d'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>b'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>mb'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>dh'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>lem'</token>
+  </entry>
+</dictionary>