You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@camel.apache.org by ja...@apache.org on 2022/03/18 13:11:14 UTC
[camel-quarkus] 04/08: Work around Tika version incompatibilities between Quarkus Tika & Camel Tika #3599
This is an automated email from the ASF dual-hosted git repository.
jamesnetherton pushed a commit to branch camel-main
in repository https://gitbox.apache.org/repos/asf/camel-quarkus.git
commit 836b8b5e31e479ff7801e0230f5dbe8ab119421b
Author: James Netherton <ja...@gmail.com>
AuthorDate: Tue Mar 8 10:50:20 2022 +0000
Work around Tika version incompatibilities between Quarkus Tika & Camel Tika #3599
---
extensions/tika/runtime/pom.xml | 11 ++--
.../camel/quarkus/component/tika/TikaRecorder.java | 33 +++++++++-
.../tika/graalvm/TikaProducerSubstitutions.java | 77 ++++++++++++++++++++++
3 files changed, 115 insertions(+), 6 deletions(-)
diff --git a/extensions/tika/runtime/pom.xml b/extensions/tika/runtime/pom.xml
index 3470945..74ae8a2 100644
--- a/extensions/tika/runtime/pom.xml
+++ b/extensions/tika/runtime/pom.xml
@@ -58,11 +58,7 @@
<exclusions>
<exclusion>
<groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
+ <artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
@@ -74,6 +70,11 @@
<groupId>io.quarkiverse.tika</groupId>
<artifactId>quarkus-tika</artifactId>
</dependency>
+ <!-- Provides the com.oracle.svm.core.annotate annotations (@TargetClass, @Substitute, @Alias) used by TikaProducerSubstitutions -->
+ <dependency>
+ <groupId>org.graalvm.nativeimage</groupId>
+ <artifactId>svm</artifactId>
+ <scope>provided</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java
index 6d6760b..c5ea87f 100644
--- a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java
+++ b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java
@@ -18,9 +18,14 @@ package org.apache.camel.quarkus.component.tika;
import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Set;
+import javax.xml.transform.TransformerConfigurationException;
+
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -36,12 +41,14 @@ import org.apache.camel.Producer;
import org.apache.camel.component.tika.TikaComponent;
import org.apache.camel.component.tika.TikaConfiguration;
import org.apache.camel.component.tika.TikaEndpoint;
+import org.apache.camel.component.tika.TikaParseOutputFormat;
import org.apache.camel.component.tika.TikaProducer;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
@Recorder
public class TikaRecorder {
@@ -78,7 +85,7 @@ public class TikaRecorder {
@Override
public Producer createProducer() throws Exception {
TikaParser tikaParser = tikaParserProducer.tikaParser();
- return new TikaProducer(this, new Parser() {
+ return new QuarkusTikaProducer(this, new Parser() {
@Override
public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
return Collections.emptySet();
@@ -99,4 +106,28 @@ public class TikaRecorder {
}
}
+    // TODO: Remove this when Camel Tika & Quarkus Tika versions are aligned
+    // https://github.com/apache/camel-quarkus/issues/3599
+    /**
+     * {@link TikaProducer} variant that builds the content handler for the
+     * {@code textMain} output format itself and defers every other output
+     * format to the parent implementation.
+     */
+    static class QuarkusTikaProducer extends TikaProducer {
+
+        /**
+         * Creates a producer for the given endpoint.
+         *
+         * @param endpoint the Tika endpoint this producer belongs to
+         */
+        public QuarkusTikaProducer(TikaEndpoint endpoint) {
+            super(endpoint);
+        }
+
+        /**
+         * Creates a producer for the given endpoint that uses the supplied parser.
+         *
+         * @param endpoint the Tika endpoint this producer belongs to
+         * @param parser   the parser handed to the parent producer
+         */
+        public QuarkusTikaProducer(TikaEndpoint endpoint, Parser parser) {
+            super(endpoint, parser);
+        }
+
+        /**
+         * Returns a {@link BoilerpipeContentHandler} when the configured output format is
+         * {@code textMain}; otherwise delegates to {@link TikaProducer}.
+         *
+         * @param  configuration the endpoint configuration supplying output format and encoding
+         * @param  outputStream  the stream the handler writes the parse result to
+         * @return               the content handler matching the configured output format
+         */
+        @Override
+        protected ContentHandler getContentHandler(TikaConfiguration configuration, OutputStream outputStream)
+                throws TransformerConfigurationException, UnsupportedEncodingException {
+            // Anything other than textMain is still handled by the parent producer.
+            if (!configuration.getTikaParseOutputFormat().equals(TikaParseOutputFormat.textMain)) {
+                return super.getContentHandler(configuration, outputStream);
+            }
+
+            // textMain: write through a writer that uses the configured parse output encoding.
+            OutputStreamWriter encodedWriter = new OutputStreamWriter(outputStream,
+                    configuration.getTikaParseOutputEncoding());
+            return new BoilerpipeContentHandler(encodedWriter);
+        }
+    }
+
}
diff --git a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java
new file mode 100644
index 0000000..343edae
--- /dev/null
+++ b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.camel.quarkus.component.tika.graalvm;
+
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.xml.sax.ContentHandler;
+
+import com.oracle.svm.core.annotate.Alias;
+import com.oracle.svm.core.annotate.Substitute;
+import com.oracle.svm.core.annotate.TargetClass;
+import org.apache.camel.component.tika.TikaConfiguration;
+import org.apache.camel.component.tika.TikaParseOutputFormat;
+import org.apache.camel.component.tika.TikaProducer;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ExpandedTitleContentHandler;
+
+// TODO: Remove this when Camel Tika & Quarkus Tika versions are aligned
+// https://github.com/apache/camel-quarkus/issues/3599
+/**
+ * GraalVM native image substitution targeting {@link TikaProducer}. It replaces
+ * {@code getContentHandler} with a version that has no {@code textMain} case.
+ */
+@TargetClass(TikaProducer.class)
+public final class TikaProducerSubstitutions {
+
+    /** Alias of the target class' {@code encoding} field, used for the {@code text} output format. */
+    @Alias
+    private String encoding;
+
+    /**
+     * Alias of the target class' {@code getTransformerHandler} method. The {@code null}
+     * returned here is only a placeholder body for the alias declaration.
+     */
+    @Alias
+    private TransformerHandler getTransformerHandler(
+            OutputStream output, String method,
+            boolean prettyPrint)
+            throws TransformerConfigurationException, UnsupportedEncodingException {
+        return null;
+    }
+
+    /**
+     * Selects the content handler for the configured output format ({@code xml},
+     * {@code text} or {@code html}).
+     *
+     * @param  configuration            the endpoint configuration supplying the output format
+     * @param  outputStream             the stream the handler writes the parse result to
+     * @return                          the content handler for the configured output format
+     * @throws IllegalArgumentException if the output format is none of the handled cases
+     */
+    // Removes problematic textMain switch case since it's covered in the custom TikaProducer in TikaRecorder
+    @Substitute
+    private ContentHandler getContentHandler(TikaConfiguration configuration, OutputStream outputStream)
+            throws TransformerConfigurationException, UnsupportedEncodingException {
+
+        switch (configuration.getTikaParseOutputFormat()) {
+        case xml:
+            return getTransformerHandler(outputStream, "xml", true);
+        case text:
+            return new BodyContentHandler(new OutputStreamWriter(outputStream, this.encoding));
+        case html:
+            return new ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", true));
+        default:
+            throw new IllegalArgumentException(
+                    String.format("Unknown format %s", configuration.getTikaParseOutputFormat()));
+        }
+    }
+}