You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/12/04 21:24:44 UTC

[tika-docker] 01/01: Added docker-compose example for NamedEntityParser

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika-docker.git

commit b795997dfb12d931aa5704641e844260095e5db9
Author: David Meikle <dm...@apache.org>
AuthorDate: Fri Dec 4 21:23:58 2020 +0000

    Added docker-compose example for NamedEntityParser
---
 docker-compose-tika-ner.yml           | 30 +++++++++++++++++
 sample-configs/ner/run_tika_server.sh | 62 +++++++++++++++++++++++++++++++++++
 sample-configs/ner/tika-config.xml    | 28 ++++++++++++++++
 3 files changed, 120 insertions(+)

diff --git a/docker-compose-tika-ner.yml b/docker-compose-tika-ner.yml
new file mode 100644
index 0000000..702cdfb
--- /dev/null
+++ b/docker-compose-tika-ner.yml
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: "3.8"
+services:
+  # Apache Tika Server
+  tika:
+    image: apache/tika:1.25-full
+    # Replace the image entrypoint with the mounted script, which fetches the
+    # models and sets up the recognisers before starting Tika Server
+    entrypoint: ["/ner/run_tika_server.sh"]
+    restart: on-failure
+    ports:
+      - "9998:9998"
+    volumes:
+      - ./sample-configs/ner/:/ner/
+
+   
\ No newline at end of file
diff --git a/sample-configs/ner/run_tika_server.sh b/sample-configs/ner/run_tika_server.sh
new file mode 100755
index 0000000..9b46034
--- /dev/null
+++ b/sample-configs/ner/run_tika_server.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#############################################################################
+# See https://cwiki.apache.org/confluence/display/TIKA/TikaAndNER for details
+# on how to configure additional NER libraries
+#############################################################################
+
+# ------------------------------------
+# Download OpenNLP Models to classpath
+# ------------------------------------
+
+OPENNLP_LOCATION="/ner/org/apache/tika/parser/ner/opennlp"
+URL="http://opennlp.sourceforge.net/models-1.5"
+
+mkdir -p $OPENNLP_LOCATION
+if [ "$(ls -A $OPENNLP_LOCATION/*.bin)" ]; then
+    echo "OpenNLP models directory has files, so skipping fetch";
+else
+	echo "No OpenNLP models found, so fetching them"
+	wget "$URL/en-ner-person.bin" -O $OPENNLP_LOCATION/ner-person.bin
+	wget "$URL/en-ner-location.bin" -O $OPENNLP_LOCATION/ner-location.bin
+	wget "$URL/en-ner-organization.bin" -O $OPENNLP_LOCATION/ner-organization.bin;
+	wget "$URL/en-ner-date.bin" -O $OPENNLP_LOCATION/ner-date.bin
+	wget "$URL/en-ner-time.bin" -O $OPENNLP_LOCATION/ner-time.bin
+	wget "$URL/en-ner-percentage.bin" -O $OPENNLP_LOCATION/ner-percentage.bin
+	wget "$URL/en-ner-money.bin" -O $OPENNLP_LOCATION/ner-money.bin
+fi
+
+# --------------------------------------------
+# Create RegExp Example for Email on classpath
+# --------------------------------------------
+REGEXP_LOCATION="/ner/org/apache/tika/parser/ner/regex"
+mkdir -p "$REGEXP_LOCATION"  # the pattern below is single-quoted so bash writes it verbatim; inside double quotes "\\[" collapsed to "\["
+printf '%s\n' 'EMAIL=(?:[a-z0-9!#$%&'\''*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'\''*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])' > "$REGEXP_LOCATION/ner-regex.txt"
+
+
+# -------------------
+# Now run Tika Server
+# -------------------
+
+# Can be a single implementation or a comma-separated list of multiple implementations for the "ner.impl.class" property
+RECOGNISERS=org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser,org.apache.tika.parser.ner.regex.RegexNERecogniser
+# Set classpath to the Tika Server JAR and the /ner folder so it has the configuration and models from above
+CLASSPATH=/ner:/tika-server-1.25.jar
+# Run the server with the custom configuration ner.impl.class property and custom /ner/tika-config.xml
+exec java -Dner.impl.class=$RECOGNISERS -cp $CLASSPATH org.apache.tika.server.TikaServerCli -h 0.0.0.0 -c /ner/tika-config.xml
\ No newline at end of file
diff --git a/sample-configs/ner/tika-config.xml b/sample-configs/ner/tika-config.xml
new file mode 100644
index 0000000..65d5774
--- /dev/null
+++ b/sample-configs/ner/tika-config.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<properties> <!-- Tika configuration: registers NamedEntityParser for the MIME types listed below -->
+    <parsers>
+        <parser class="org.apache.tika.parser.ner.NamedEntityParser">
+            <mime>application/pdf</mime>
+            <mime>text/plain</mime>
+            <mime>text/html</mime>
+            <mime>application/xhtml+xml</mime>
+        </parser>
+    </parsers>
+</properties>
+