You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/05/03 10:07:44 UTC
opennlp git commit: OPENNLP-1047: Add detokenizer and sent detect
abbreviations for Irish
Repository: opennlp
Updated Branches:
refs/heads/master 3df659b9b -> caeaaeea6
OPENNLP-1047: Add detokenizer and sent detect abbreviations for Irish
Closes #188
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/caeaaeea
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/caeaaeea
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/caeaaeea
Branch: refs/heads/master
Commit: caeaaeea61e88fe4222b997b2dad49728b91ba68
Parents: 3df659b
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sat Apr 29 00:06:42 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 3 12:05:16 2017 +0200
----------------------------------------------------------------------
opennlp-tools/lang/ga/sentdetect/abb.xml | 164 +++++++++++++++++++
.../lang/ga/tokenizer/ga-detokenizer.xml | 113 +++++++++++++
2 files changed, 277 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/caeaaeea/opennlp-tools/lang/ga/sentdetect/abb.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/lang/ga/sentdetect/abb.xml b/opennlp-tools/lang/ga/sentdetect/abb.xml
new file mode 100644
index 0000000..9d15aed
--- /dev/null
+++ b/opennlp-tools/lang/ga/sentdetect/abb.xml
@@ -0,0 +1,164 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<dictionary case_sensitive="false"> <!-- Abbreviation dictionary for Irish (ga) sentence detection: every entry holds a single token that ends in a period. case_sensitive is "false"; presumably lookups ignore letter case (TODO confirm against the dictionary loader). -->
+<entry> <!-- Titles and general abbreviations, Irish and English mixed -->
+<token>tel.</token>
+</entry>
+<entry>
+<token>Mr.</token>
+</entry>
+<entry>
+<token>Mrs.</token>
+</entry>
+<entry> <!-- ".i." begins with a period as well as ending with one; likely the Irish "that is" marker (TODO confirm) -->
+<token>.i.</token>
+</entry>
+<entry>
+<token>Uacht.</token>
+</entry>
+<entry>
+<token>m.sh.</token>
+</entry>
+<entry>
+<token>lch.</token>
+</entry>
+<entry>
+<token>lgh.</token>
+</entry>
+<entry>
+<token>Dr.</token>
+</entry>
+<entry>
+<token>uimh.</token>
+</entry>
+<entry>
+<token>Co.</token>
+</entry>
+<entry> <!-- "gCo." here and "tUacht."/"tUas." below look like initial-mutation variants of "Co.", "Uacht." and "Uas.", listed as separate tokens (TODO confirm) -->
+<token>gCo.</token>
+</entry>
+<entry>
+<token>tUacht.</token>
+</entry>
+<entry>
+<token>Uas.</token>
+</entry>
+<entry>
+<token>tUas.</token>
+</entry>
+<entry>
+<token>Msc.</token>
+</entry>
+<entry>
+<token>Ms.</token>
+</entry>
+<entry>
+<token>Sr.</token>
+</entry>
+<entry>
+<token>Jr.</token>
+</entry>
+<entry>
+<token>Bros.</token>
+</entry>
+<entry>
+<token>fig.</token>
+</entry>
+<entry> <!-- English month abbreviations: there is no entry for May, and both "Sep." and "Sept." are listed -->
+<token>Jan.</token>
+</entry>
+<entry>
+<token>Feb.</token>
+</entry>
+<entry>
+<token>Mar.</token>
+</entry>
+<entry>
+<token>Apr.</token>
+</entry>
+<entry>
+<token>Jun.</token>
+</entry>
+<entry>
+<token>Jul.</token>
+</entry>
+<entry>
+<token>Aug.</token>
+</entry>
+<entry>
+<token>Sep.</token>
+</entry>
+<entry>
+<token>Sept.</token>
+</entry>
+<entry>
+<token>Oct.</token>
+</entry>
+<entry>
+<token>Nov.</token>
+</entry>
+<entry>
+<token>Dec.</token>
+</entry>
+<entry> <!-- Twelve entries from "Ean." to "Nol."; presumably the Irish month abbreviations in calendar order (TODO confirm spellings with an Irish speaker) -->
+<token>Ean.</token>
+</entry>
+<entry>
+<token>Fea.</token>
+</entry>
+<entry>
+<token>Már.</token>
+</entry>
+<entry>
+<token>Aib.</token>
+</entry>
+<entry>
+<token>Bea.</token>
+</entry>
+<entry>
+<token>Mei.</token>
+</entry>
+<entry>
+<token>Iúl.</token>
+</entry>
+<entry>
+<token>Lún.</token>
+</entry>
+<entry>
+<token>M.Fr.</token>
+</entry>
+<entry>
+<token>D.Fr.</token>
+</entry>
+<entry>
+<token>Sam.</token>
+</entry>
+<entry>
+<token>Nol.</token>
+</entry>
+<entry> <!-- Company suffixes; "Teo." is presumably the Irish counterpart of "Ltd." (TODO confirm) -->
+<token>Ltd.</token>
+</entry>
+<entry>
+<token>Teo.</token>
+</entry>
+</dictionary>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/caeaaeea/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml b/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
new file mode 100644
index 0000000..23fe96a
--- /dev/null
+++ b/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<dictionary> <!-- Detokenizer rules for Irish (ga): each entry pairs one token with an operation attribute (RIGHT_LEFT_MATCHING, MOVE_LEFT, MOVE_RIGHT or MOVE_BOTH). NOTE(review): the operation names suggest which neighbouring token an entry is joined to; confirm the exact semantics against the detokenizer documentation. -->
+ <entry operation="RIGHT_LEFT_MATCHING"> <!-- Straight double and single quotes are the only RIGHT_LEFT_MATCHING entries -->
+ <token>"</token>
+ </entry>
+ <entry operation="RIGHT_LEFT_MATCHING">
+ <token>'</token>
+ </entry>
+ <entry operation="MOVE_LEFT"> <!-- Sentence and clause punctuation (. ? ! , ; :): MOVE_LEFT -->
+ <token>.</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>?</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>!</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>,</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>;</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>:</token>
+ </entry>
+ <entry operation="MOVE_RIGHT"> <!-- Brackets: opening forms use MOVE_RIGHT, closing forms use MOVE_LEFT -->
+ <token>(</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>)</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>}</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>{</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>]</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>[</token>
+ </entry>
+ <entry operation="MOVE_LEFT"> <!-- Guillemets and the two-character quote tokens: the closing forms use MOVE_LEFT, the opening forms use MOVE_RIGHT -->
+ <token>»</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>«</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>``</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>''</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>%</token>
+ </entry>
+ <entry operation="MOVE_LEFT"> <!-- Domain suffixes .org, .com and .net: MOVE_LEFT -->
+ <token>.org</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>.com</token>
+ </entry>
+ <entry operation="MOVE_LEFT">
+ <token>.net</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>#</token>
+ </entry>
+ <entry operation="MOVE_BOTH"> <!-- The hyphen is the only MOVE_BOTH entry -->
+ <token>-</token>
+ </entry>
+ <entry operation="MOVE_RIGHT"> <!-- Apostrophe-final Irish forms (m', d', b', mb', dh', lem'): MOVE_RIGHT, presumably so each attaches to the word that follows (TODO confirm) -->
+ <token>m'</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>d'</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>b'</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>mb'</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>dh'</token>
+ </entry>
+ <entry operation="MOVE_RIGHT">
+ <token>lem'</token>
+ </entry>
+</dictionary>