You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@camel.apache.org by ja...@apache.org on 2022/03/18 13:11:14 UTC

[camel-quarkus] 04/08: Work around Tika version incompatibilities between Quarkus Tika & Camel Tika #3599

This is an automated email from the ASF dual-hosted git repository.

jamesnetherton pushed a commit to branch camel-main
in repository https://gitbox.apache.org/repos/asf/camel-quarkus.git

commit 836b8b5e31e479ff7801e0230f5dbe8ab119421b
Author: James Netherton <ja...@gmail.com>
AuthorDate: Tue Mar 8 10:50:20 2022 +0000

    Work around Tika version incompatibilities between Quarkus Tika & Camel Tika #3599
---
 extensions/tika/runtime/pom.xml                    | 11 ++--
 .../camel/quarkus/component/tika/TikaRecorder.java | 33 +++++++++-
 .../tika/graalvm/TikaProducerSubstitutions.java    | 77 ++++++++++++++++++++++
 3 files changed, 115 insertions(+), 6 deletions(-)

diff --git a/extensions/tika/runtime/pom.xml b/extensions/tika/runtime/pom.xml
index 3470945..74ae8a2 100644
--- a/extensions/tika/runtime/pom.xml
+++ b/extensions/tika/runtime/pom.xml
@@ -58,11 +58,7 @@
             <exclusions>
                 <exclusion>
                     <groupId>org.apache.tika</groupId>
-                    <artifactId>tika-core</artifactId>
-                </exclusion>
-                <exclusion>
-                    <groupId>org.apache.tika</groupId>
-                    <artifactId>tika-parsers</artifactId>
+                    <artifactId>*</artifactId>
                 </exclusion>
             </exclusions>
         </dependency>
@@ -74,6 +70,11 @@
             <groupId>io.quarkiverse.tika</groupId>
             <artifactId>quarkus-tika</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.graalvm.nativeimage</groupId>
+            <artifactId>svm</artifactId>
+            <scope>provided</scope>
+        </dependency>
     </dependencies>
 
     <build>
diff --git a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java
index 6d6760b..c5ea87f 100644
--- a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java
+++ b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java
@@ -18,9 +18,14 @@ package org.apache.camel.quarkus.component.tika;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
 import java.util.Collections;
 import java.util.Set;
 
+import javax.xml.transform.TransformerConfigurationException;
+
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -36,12 +41,14 @@ import org.apache.camel.Producer;
 import org.apache.camel.component.tika.TikaComponent;
 import org.apache.camel.component.tika.TikaConfiguration;
 import org.apache.camel.component.tika.TikaEndpoint;
+import org.apache.camel.component.tika.TikaParseOutputFormat;
 import org.apache.camel.component.tika.TikaProducer;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
 
 @Recorder
 public class TikaRecorder {
@@ -78,7 +85,7 @@ public class TikaRecorder {
         @Override
         public Producer createProducer() throws Exception {
             TikaParser tikaParser = tikaParserProducer.tikaParser();
-            return new TikaProducer(this, new Parser() {
+            return new QuarkusTikaProducer(this, new Parser() {
                 @Override
                 public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
                     return Collections.emptySet();
@@ -99,4 +106,28 @@ public class TikaRecorder {
         }
     }
 
+    // TODO: Remove this when Camel Tika & Quarkus Tika versions are aligned
+    // https://github.com/apache/camel-quarkus/issues/3599
+    static class QuarkusTikaProducer extends TikaProducer { // TikaProducer variant that builds the textMain content handler itself
+
+        public QuarkusTikaProducer(TikaEndpoint endpoint) { // delegates to TikaProducer(endpoint)
+            super(endpoint);
+        }
+
+        public QuarkusTikaProducer(TikaEndpoint endpoint, Parser parser) { // delegates to TikaProducer(endpoint, parser)
+            super(endpoint, parser);
+        }
+
+        @Override
+        protected ContentHandler getContentHandler(TikaConfiguration configuration, OutputStream outputStream)
+                throws TransformerConfigurationException, UnsupportedEncodingException {
+            TikaParseOutputFormat outputFormat = configuration.getTikaParseOutputFormat();
+            if (outputFormat.equals(TikaParseOutputFormat.textMain)) { // textMain is the only format handled in this override
+                return new BoilerpipeContentHandler( // writes to outputStream through a writer using the configured output encoding
+                        new OutputStreamWriter(outputStream, configuration.getTikaParseOutputEncoding()));
+            }
+            return super.getContentHandler(configuration, outputStream); // every other format is left to TikaProducer
+        }
+    }
+
 }
diff --git a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java
new file mode 100644
index 0000000..343edae
--- /dev/null
+++ b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.camel.quarkus.component.tika.graalvm;
+
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.xml.sax.ContentHandler;
+
+import com.oracle.svm.core.annotate.Alias;
+import com.oracle.svm.core.annotate.Substitute;
+import com.oracle.svm.core.annotate.TargetClass;
+import org.apache.camel.component.tika.TikaConfiguration;
+import org.apache.camel.component.tika.TikaParseOutputFormat;
+import org.apache.camel.component.tika.TikaProducer;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ExpandedTitleContentHandler;
+
+// TODO: Remove this when Camel Tika & Quarkus Tika versions are aligned
+// https://github.com/apache/camel-quarkus/issues/3599
+@TargetClass(TikaProducer.class) // GraalVM native-image substitution applied to Camel's TikaProducer
+public final class TikaProducerSubstitutions {
+
+    @Alias
+    private String encoding; // aliases the 'encoding' field of the target class; read by the 'text' case below
+
+    // Replaces the original method with one lacking the problematic textMain case; QuarkusTikaProducer in TikaRecorder handles textMain
+    @Substitute
+    private ContentHandler getContentHandler(TikaConfiguration configuration, OutputStream outputStream)
+            throws TransformerConfigurationException, UnsupportedEncodingException {
+
+        ContentHandler result = null;
+
+        TikaParseOutputFormat outputFormat = configuration.getTikaParseOutputFormat();
+        switch (outputFormat) {
+        case xml:
+            result = getTransformerHandler(outputStream, "xml", true); // pretty-printed XML output
+            break;
+        case text:
+            result = new BodyContentHandler(new OutputStreamWriter(outputStream, this.encoding)); // text written using the aliased encoding
+            break;
+        case html:
+            result = new ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", true)); // pretty-printed HTML output
+            break;
+        default: // any format without a case above, textMain included, is rejected here
+            throw new IllegalArgumentException(
+                    String.format("Unknown format %s", configuration.getTikaParseOutputFormat()));
+        }
+        return result;
+    }
+
+    @Alias
+    private TransformerHandler getTransformerHandler(
+            OutputStream output, String method,
+            boolean prettyPrint)
+            throws TransformerConfigurationException, UnsupportedEncodingException {
+        return null; // placeholder body; with @Alias, calls are expected to reach the target class's own method (per GraalVM annotation semantics)
+    }
+}