You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/12/04 21:24:44 UTC
[tika-docker] 01/01: Added docker-compose example for
NamedEntityParser
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika-docker.git
commit b795997dfb12d931aa5704641e844260095e5db9
Author: David Meikle <dm...@apache.org>
AuthorDate: Fri Dec 4 21:23:58 2020 +0000
Added docker-compose example for NamedEntityParser
---
docker-compose-tika-ner.yml | 30 +++++++++++++++++
sample-configs/ner/run_tika_server.sh | 62 +++++++++++++++++++++++++++++++++++
sample-configs/ner/tika-config.xml | 28 ++++++++++++++++
3 files changed, 120 insertions(+)
diff --git a/docker-compose-tika-ner.yml b/docker-compose-tika-ner.yml
new file mode 100644
index 0000000..702cdfb
--- /dev/null
+++ b/docker-compose-tika-ner.yml
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: "3.8"
+services:
+
+  ## Apache Tika Server with the NamedEntityParser enabled
+  tika:
+    image: apache/tika:1.25-full
+    # Use custom script as entrypoint to go fetch models and setup recognisers
+    entrypoint: ["/ner/run_tika_server.sh"]
+    restart: on-failure
+    ports:
+      - "9998:9998"  # host:container, quoted so it is always read as a string
+    volumes:
+      - ./sample-configs/ner/:/ner/  # provides the entrypoint script and tika-config.xml
+
+
\ No newline at end of file
diff --git a/sample-configs/ner/run_tika_server.sh b/sample-configs/ner/run_tika_server.sh
new file mode 100755
index 0000000..9b46034
--- /dev/null
+++ b/sample-configs/ner/run_tika_server.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#############################################################################
+# See https://cwiki.apache.org/confluence/display/TIKA/TikaAndNER for details
+# on how to configure additional NER libraries
+#############################################################################
+
+# ------------------------------------
+# Download OpenNLP Models to classpath
+# ------------------------------------
+
+OPENNLP_LOCATION="/ner/org/apache/tika/parser/ner/opennlp"
+URL="http://opennlp.sourceforge.net/models-1.5"
+
+mkdir -p $OPENNLP_LOCATION
+if [ "$(ls -A $OPENNLP_LOCATION/*.bin)" ]; then
+ echo "OpenNLP models directory has files, so skipping fetch";
+else
+ echo "No OpenNLP models found, so fetching them"
+ wget "$URL/en-ner-person.bin" -O $OPENNLP_LOCATION/ner-person.bin
+ wget "$URL/en-ner-location.bin" -O $OPENNLP_LOCATION/ner-location.bin
+ wget "$URL/en-ner-organization.bin" -O $OPENNLP_LOCATION/ner-organization.bin;
+ wget "$URL/en-ner-date.bin" -O $OPENNLP_LOCATION/ner-date.bin
+ wget "$URL/en-ner-time.bin" -O $OPENNLP_LOCATION/ner-time.bin
+ wget "$URL/en-ner-percentage.bin" -O $OPENNLP_LOCATION/ner-percentage.bin
+ wget "$URL/en-ner-money.bin" -O $OPENNLP_LOCATION/ner-money.bin
+fi
+
+# --------------------------------------------
+# Create RegExp Example for Email on classpath
+# --------------------------------------------
+# Inside double quotes the shell collapses \\ to \, so a literal \\ in the pattern is spelled \\\\
+REGEXP_LOCATION="/ner/org/apache/tika/parser/ner/regex"
+mkdir -p "$REGEXP_LOCATION"
+echo "EMAIL=(?:[a-z0-9!#$%&'*+/=?^_\`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_\`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])" > "$REGEXP_LOCATION/ner-regex.txt"
+
+# -------------------
+# Now run Tika Server
+# -------------------
+
+# Value for the "ner.impl.class" property: one recogniser class, or several separated by commas
+NER_IMPLS="org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser,org.apache.tika.parser.ner.regex.RegexNERecogniser"
+# /ner sits on the classpath beside the Tika Server JAR so the models and regex file prepared above are visible
+SERVER_CP="/ner:/tika-server-1.25.jar"
+# Replace this shell with the JVM, passing the recogniser list and the custom /ner/tika-config.xml
+exec java "-Dner.impl.class=${NER_IMPLS}" -cp "${SERVER_CP}" org.apache.tika.server.TikaServerCli -h 0.0.0.0 -c /ner/tika-config.xml
\ No newline at end of file
diff --git a/sample-configs/ner/tika-config.xml b/sample-configs/ner/tika-config.xml
new file mode 100644
index 0000000..65d5774
--- /dev/null
+++ b/sample-configs/ner/tika-config.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.ner.NamedEntityParser"> <!-- MIME types handed to the NER parser -->
+      <mime>application/pdf</mime>
+      <mime>text/plain</mime>
+      <mime>text/html</mime>
+      <mime>application/xhtml+xml</mime>
+    </parser>
+  </parsers>
+</properties>
+