You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/12/03 23:58:21 UTC

[tika-docker] branch master updated: Added docker-compose example for TesseractOCR

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika-docker.git


The following commit(s) were added to refs/heads/master by this push:
     new fc7355f  Added docker-compose example for TesseractOCR
fc7355f is described below

commit fc7355fb1e3191cb11d39fa1f810f87acbfdbdb5
Author: David Meikle <dm...@apache.org>
AuthorDate: Thu Dec 3 23:58:02 2020 +0000

    Added docker-compose example for TesseractOCR
---
 docker-compose-tika-customocr.yml                  | 34 +++++++++++++++++++
 .../tika/parser/ocr/TesseractOCRConfig.properties  | 25 ++++++++++++++
 sample-configs/customocr/tika-config-inline.xml    | 31 ++++++++++++++++++
 sample-configs/customocr/tika-config-rendered.xml  | 38 ++++++++++++++++++++++
 4 files changed, 128 insertions(+)

diff --git a/docker-compose-tika-customocr.yml b/docker-compose-tika-customocr.yml
new file mode 100644
index 0000000..2084a0d
--- /dev/null
+++ b/docker-compose-tika-customocr.yml
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: "3.8"
+services:
+  # Apache Tika Server with a customised Tesseract OCR configuration
+  tika:
+    image: apache/tika:1.25-full
+    # Override the default entrypoint so /customocr is first on the classpath
+    entrypoint: [ "/bin/sh", "-c", "exec java -cp /customocr:/tika-server-1.25.jar org.apache.tika.server.TikaServerCli -h 0.0.0.0 \"$$0\" \"$$@\""]
+    # Reaches the entrypoint script as $0 and $@; could be inlined in entrypoint too
+    command: -c /tika-config.xml
+    restart: on-failure
+    ports:
+      - "9998:9998"
+    volumes:
+      - ./sample-configs/customocr:/customocr  # classpath overrides (TesseractOCRConfig.properties)
+      # Choose the configuration you want, or add your own custom one
+      # - ./sample-configs/customocr/tika-config-inline.xml:/tika-config.xml
+      - ./sample-configs/customocr/tika-config-rendered.xml:/tika-config.xml
+
+   
\ No newline at end of file
diff --git a/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
new file mode 100644
index 0000000..b4b787f
--- /dev/null
+++ b/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Customise these TesseractOCRConfig settings, or add any others you need, here
+language=eng+spa+fra+deu+ita
+timeout=240
+minFileSizeToOcr=1
+enableImageProcessing=0
+density=200
+depth=8
+filter=box
+resize=300
+applyRotation=true
\ No newline at end of file
diff --git a/sample-configs/customocr/tika-config-inline.xml b/sample-configs/customocr/tika-config-inline.xml
new file mode 100644
index 0000000..1c9b613
--- /dev/null
+++ b/sample-configs/customocr/tika-config-inline.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<properties>
+  <parsers>
+    <!-- Register TesseractOCRParser (use DefaultParser instead if you want the other parsers too) -->
+    <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+
+    <!-- Have the PDF parser extract inline images so that they can be OCRed -->
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <param name="extractInlineImages" type="bool">true</param>
+      </params>
+    </parser>
+
+  </parsers>
+</properties>
diff --git a/sample-configs/customocr/tika-config-rendered.xml b/sample-configs/customocr/tika-config-rendered.xml
new file mode 100644
index 0000000..bcd8666
--- /dev/null
+++ b/sample-configs/customocr/tika-config-rendered.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<properties>
+  <parsers>
+    <!-- Register TesseractOCRParser (use DefaultParser instead if you want the other parsers too) -->
+    <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+
+    <!-- Run OCR on rendered PDF pages -->
+    <parser class="org.apache.tika.parser.pdf.PDFParser">
+      <params>
+        <!-- no_ocr: extract text only
+             ocr_only: skip text extraction and just attempt OCR
+             ocr_and_text: extract text and attempt OCR (from Tika 1.24)
+             auto: extract text, but try OCR when fewer than 10 characters are found
+        -->
+        <param name="ocrStrategy" type="string">ocr_only</param>
+        <param name="ocrImageType" type="string">rgb</param>
+        <param name="ocrDPI" type="int">100</param>
+      </params>
+    </parser>
+
+  </parsers>
+</properties>