You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@camel.apache.org by da...@apache.org on 2017/01/29 16:54:33 UTC
[3/6] camel git commit: CAMEL-10740 - Code cleanup and encoding support.
CAMEL-10740 - Code cleanup and encoding support.
Project: http://git-wip-us.apache.org/repos/asf/camel/repo
Commit: http://git-wip-us.apache.org/repos/asf/camel/commit/c73068e7
Tree: http://git-wip-us.apache.org/repos/asf/camel/tree/c73068e7
Diff: http://git-wip-us.apache.org/repos/asf/camel/diff/c73068e7
Branch: refs/heads/master
Commit: c73068e7d42f5f8a83b218463389383d6fb26837
Parents: 17c83ba
Author: Bob Paulin <bo...@bobpaulin.com>
Authored: Sat Jan 28 23:58:12 2017 -0600
Committer: Claus Ibsen <da...@apache.org>
Committed: Sun Jan 29 17:06:27 2017 +0100
----------------------------------------------------------------------
components/camel-tika/pom.xml | 141 +++++++++----------
.../src/main/docs/tika-component.adoc | 8 +-
.../camel/component/tika/TikaConfiguration.java | 26 +++-
.../camel/component/tika/TikaEndpoint.java | 2 +-
.../camel/component/tika/TikaProducer.java | 38 ++---
.../camel/component/tika/TikaParseTest.java | 67 ++++++++-
.../src/test/resources/testOpenOffice2.odt | Bin 0 -> 26460 bytes
7 files changed, 175 insertions(+), 107 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/pom.xml
----------------------------------------------------------------------
diff --git a/components/camel-tika/pom.xml b/components/camel-tika/pom.xml
index 86f0131..6233b9f 100644
--- a/components/camel-tika/pom.xml
+++ b/components/camel-tika/pom.xml
@@ -15,80 +15,79 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.camel</groupId>
+ <artifactId>components</artifactId>
+ <version>2.19.0-SNAPSHOT</version>
+ </parent>
- <parent>
- <groupId>org.apache.camel</groupId>
- <artifactId>components</artifactId>
- <version>2.19.0-SNAPSHOT</version>
- </parent>
+ <artifactId>camel-tika</artifactId>
+ <packaging>jar</packaging>
+ <name>Camel :: Tika</name>
+ <description>This component integrates with Apache Tika to extract content and metadata from thousands of file types.</description>
- <artifactId>camel-tika</artifactId>
- <packaging>jar</packaging>
- <name>Camel :: Tika</name>
- <description>This component integrates with Apache Tika to extract content and metadata from thousands of file types.</description>
+ <properties>
+ <camel.osgi.export.pkg>org.apache.camel.component.tika.*</camel.osgi.export.pkg>
+ <camel.osgi.export.service>org.apache.camel.spi.ComponentResolver;component=tika</camel.osgi.export.service>
+ </properties>
- <properties>
- <camel.osgi.export.pkg>org.apache.camel.component.tika.*</camel.osgi.export.pkg>
- <camel.osgi.export.service>org.apache.camel.spi.ComponentResolver;component=tika</camel.osgi.export.service>
- </properties>
-
- <dependencies>
-
- <dependency>
- <groupId>org.apache.camel</groupId>
- <artifactId>camel-core</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- <version>${tika-version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
- <version>${tika-version}</version>
- </dependency>
- <!-- test dependencies -->
- <dependency>
- <groupId>org.apache.camel</groupId>
- <artifactId>camel-test-spring</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-api</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-core</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j-impl</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons-io-version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>java-hamcrest</artifactId>
- <version>${hamcrest-version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.camel</groupId>
+ <artifactId>camel-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${tika-version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${tika-version}</version>
+ </dependency>
+ <!-- test dependencies -->
+ <dependency>
+ <groupId>org.apache.camel</groupId>
+ <artifactId>camel-test-spring</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-api</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-core</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-slf4j-impl</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons-io-version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>java-hamcrest</artifactId>
+ <version>${hamcrest-version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
</project>
http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/main/docs/tika-component.adoc
----------------------------------------------------------------------
diff --git a/components/camel-tika/src/main/docs/tika-component.adoc b/components/camel-tika/src/main/docs/tika-component.adoc
index 7049a59..f077452 100644
--- a/components/camel-tika/src/main/docs/tika-component.adoc
+++ b/components/camel-tika/src/main/docs/tika-component.adoc
@@ -41,7 +41,7 @@ The Tika component has no options.
// endpoint options: START
-The Tika component supports 5 endpoint options which are listed below:
+The Tika component supports 6 endpoint options which are listed below:
{% raw %}
[width="100%",cols="2,1,1m,1m,5",options="header"]
@@ -49,8 +49,9 @@ The Tika component supports 5 endpoint options which are listed below:
| Name | Group | Default | Java Type | Description
| operation | producer | | TikaOperation | *Required* Tika Operation. parse or detect
| tikaConfig | producer | | TikaConfig | Tika Config
-| tikaConfigUri | producer | | String | Tika Config Uri
-| tikaParseOutputFormat | producer | xml | TikaParseOutputFormat | Tika Output Format. Supported output formats are xml html text textMain
+| tikaConfigUri | producer | | String | Tika Config Uri: The URI of tika-config.xml
+| tikaParseOutputEncoding | producer | | String | Tika Parse Output Encoding - Used to specify the character encoding of the parsed output. Defaults to Charset.defaultCharset() .
+| tikaParseOutputFormat | producer | xml | TikaParseOutputFormat | Tika Output Format. Supported output formats. xml: Returns Parsed Content as XML. html: Returns Parsed Content as HTML. text: Returns Parsed Content as Text. textMain: Uses the boilerpipe library to automatically extract the main content from a web page.
| synchronous | advanced | false | boolean | Sets whether synchronous processing should be strictly used or Camel is allowed to use asynchronous processing (if supported).
|=======================================================================
{% endraw %}
@@ -61,7 +62,6 @@ The Tika component supports 5 endpoint options which are listed below:
[width="100%",cols="10%,90%",options="header",]
|=======================================================================
|Header |Description
-|TikaXXXX | Any Tika Metadata Header is converted to a Camel Header with Prefix Tika
|=======================================================================
### To Detect a file's MIME Type
http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java
----------------------------------------------------------------------
diff --git a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java
index 051ad2a..33542c0 100644
--- a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java
+++ b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java
@@ -17,6 +17,7 @@
package org.apache.camel.component.tika;
import java.io.IOException;
+import java.nio.charset.Charset;
import org.xml.sax.SAXException;
@@ -36,6 +37,8 @@ public class TikaConfiguration {
private TikaOperation operation;
@UriParam(defaultValue = "xml")
private TikaParseOutputFormat tikaParseOutputFormat = TikaParseOutputFormat.xml;
+ @UriParam(description = "Tika Parse Output Encoding")
+ private String tikaParseOutputEncoding = Charset.defaultCharset().name();
@UriParam(description = "Tika Config")
private TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
@UriParam(description = "Tika Config Url")
@@ -64,12 +67,31 @@ public class TikaConfiguration {
/**
*
- * Tika Output Format. Supported output formats are xml, html, text, textMain
+ * Tika Output Format. Supported output formats.
+ * <ul>
+ * <li>xml: Returns Parsed Content as XML. </li>
+ * <li>html: Returns Parsed Content as HTML. </li>
+ * <li>text: Returns Parsed Content as Text. </li>
+ * <li>textMain: Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a> library to automatically extract the main content from a web page. </li>
+ * </ul>
*
*/
public void setTikaParseOutputFormat(TikaParseOutputFormat tikaParseOutputFormat) {
this.tikaParseOutputFormat = tikaParseOutputFormat;
}
+
+ public String getTikaParseOutputEncoding() {
+ return tikaParseOutputEncoding;
+ }
+
+ /**
+ * Tika Parse Output Encoding - Used to specify the character encoding of the parsed output.
+ * Defaults to Charset.defaultCharset() .
+ *
+ */
+ public void setTikaParseOutputEncoding(String tikaParseOutputEncoding) {
+ this.tikaParseOutputEncoding = tikaParseOutputEncoding;
+ }
public TikaConfig getTikaConfig() {
return tikaConfig;
@@ -90,7 +112,7 @@ public class TikaConfiguration {
/**
*
- * Tika Config Uri
+ * Tika Config Uri: The URI of tika-config.xml
*
*/
public void setTikaConfigUri(String tikaConfigUri) throws TikaException, IOException, SAXException {
http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java
----------------------------------------------------------------------
diff --git a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java
index cb8fbdd..a1701d3 100644
--- a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java
+++ b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java
@@ -24,7 +24,7 @@ import org.apache.camel.impl.DefaultEndpoint;
import org.apache.camel.spi.UriEndpoint;
import org.apache.camel.spi.UriParam;
-@UriEndpoint(scheme = "tika", title = "Tika", syntax = "tika:operation", producerOnly = true, label = "tika")
+@UriEndpoint(scheme = "tika", title = "Tika", syntax = "tika:operation", producerOnly = true, label = "transformation")
public class TikaEndpoint extends DefaultEndpoint {
@UriParam
http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java
----------------------------------------------------------------------
diff --git a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java
index 1e0d9ca..309df98 100644
--- a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java
+++ b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java
@@ -22,10 +22,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
-import java.io.Writer;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-import java.util.Locale;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
@@ -57,10 +53,13 @@ public class TikaProducer extends DefaultProducer {
private final Parser parser;
private final Detector detector;
+
+ private final String encoding;
public TikaProducer(TikaEndpoint endpoint) {
super(endpoint);
this.tikaConfiguration = endpoint.getTikaConfiguration();
+ this.encoding = this.tikaConfiguration.getTikaParseOutputEncoding();
TikaConfig config = this.tikaConfiguration.getTikaConfig();
this.parser = new AutoDetectParser(config);
this.detector = config.getDetector();
@@ -111,7 +110,7 @@ public class TikaProducer extends DefaultProducer {
private void convertMetadataToHeaders(Metadata metadata, Exchange exchange) {
if (metadata != null) {
for (String metaname : metadata.names()) {
- exchange.getIn().setHeader("Tika" + metaname, metadata.get(metaname));
+ exchange.getIn().setHeader(metaname, metadata.get(metaname));
}
}
}
@@ -122,19 +121,18 @@ public class TikaProducer extends DefaultProducer {
ContentHandler result = null;
TikaParseOutputFormat outputFormat = configuration.getTikaParseOutputFormat();
- String encoding = Charset.defaultCharset().name();
switch (outputFormat) {
case xml:
- result = getTransformerHandler(outputStream, "xml", encoding, true);
+ result = getTransformerHandler(outputStream, "xml", true);
break;
case text:
- result = new BodyContentHandler(outputStream);
+ result = new BodyContentHandler(new OutputStreamWriter(outputStream, this.encoding));
break;
case textMain:
- result = new BoilerpipeContentHandler(getOutputWriter(outputStream, encoding));
+ result = new BoilerpipeContentHandler(new OutputStreamWriter(outputStream, this.encoding));
break;
case html:
- result = new ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", encoding, true));
+ result = new ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", true));
break;
default:
throw new IllegalArgumentException(
@@ -143,26 +141,16 @@ public class TikaProducer extends DefaultProducer {
return result;
}
- private TransformerHandler getTransformerHandler(OutputStream output, String method, String encoding,
- boolean prettyPrint) throws TransformerConfigurationException {
+ private TransformerHandler getTransformerHandler(OutputStream output, String method,
+ boolean prettyPrint) throws TransformerConfigurationException, UnsupportedEncodingException {
SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, prettyPrint ? "yes" : "no");
- if (encoding != null) {
- handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encoding);
+ if (this.encoding != null) {
+ handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, this.encoding);
}
- handler.setResult(new StreamResult(output));
+ handler.setResult(new StreamResult(new OutputStreamWriter(output, this.encoding)));
return handler;
}
-
- private Writer getOutputWriter(OutputStream output, String encoding) throws UnsupportedEncodingException {
- if (encoding != null) {
- return new OutputStreamWriter(output, encoding);
- } else if (System.getProperty("os.name").toLowerCase(Locale.ROOT).startsWith("mac os x")) {
- return new OutputStreamWriter(output, StandardCharsets.UTF_8);
- } else {
- return new OutputStreamWriter(output, Charset.defaultCharset());
- }
- }
}
http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java
----------------------------------------------------------------------
diff --git a/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java b/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java
index dc6d97e..1db2a8d 100644
--- a/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java
+++ b/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java
@@ -16,7 +16,15 @@
*/
package org.apache.camel.component.tika;
+import java.io.ByteArrayInputStream;
import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.Map;
import org.apache.camel.EndpointInject;
@@ -26,7 +34,11 @@ import org.apache.camel.builder.RouteBuilder;
import org.apache.camel.component.mock.MockEndpoint;
import org.apache.camel.impl.JndiRegistry;
import org.apache.camel.test.junit4.CamelTestSupport;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.txt.UniversalEncodingDetector;
import org.junit.Test;
+import org.mozilla.universalchardet.UniversalDetector;
+
import static org.hamcrest.Matchers.*;
public class TikaParseTest extends CamelTestSupport {
@@ -48,8 +60,54 @@ public class TikaParseTest extends CamelTestSupport {
Object body = exchange.getIn().getBody(String.class);
Map<String, Object> headerMap = exchange.getIn().getHeaders();
assertThat(body, instanceOf(String.class));
+
+ Charset detectedCharset = null;
+ try {
+ InputStream bodyIs = new ByteArrayInputStream(((String)body).getBytes());
+ UniversalEncodingDetector encodingDetector = new UniversalEncodingDetector();
+ detectedCharset = encodingDetector.detect(bodyIs, new Metadata());
+ } catch (IOException e1) {
+ fail();
+ }
+
+
+ assertThat(detectedCharset.name(), startsWith(Charset.defaultCharset().name()));
+
assertThat((String) body, containsString("test"));
- assertThat(headerMap.get("TikaContent-Type"), equalTo("application/msword"));
+ assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword"));
+ return true;
+ }
+ });
+ resultEndpoint.assertIsSatisfied();
+ }
+
+ @Test
+ public void testDocumentParseWithEncoding() throws Exception {
+
+ File document = new File("src/test/resources/testOpenOffice2.odt");
+ template.sendBody("direct:start4", document);
+
+ resultEndpoint.setExpectedMessageCount(1);
+
+ resultEndpoint.expectedMessagesMatches(new Predicate() {
+ @Override
+ public boolean matches(Exchange exchange) {
+ Object body = exchange.getIn().getBody(String.class);
+ Map<String, Object> headerMap = exchange.getIn().getHeaders();
+ assertThat(body, instanceOf(String.class));
+
+ Charset detectedCharset = null;
+ try {
+ InputStream bodyIs = new ByteArrayInputStream(((String)body).getBytes(StandardCharsets.UTF_16));
+ UniversalEncodingDetector encodingDetector = new UniversalEncodingDetector();
+ detectedCharset = encodingDetector.detect(bodyIs, new Metadata());
+ } catch (IOException e1) {
+ fail();
+ }
+
+
+ assertThat(detectedCharset.name(), startsWith(StandardCharsets.UTF_16.name()));
+ assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/vnd.oasis.opendocument.text"));
return true;
}
});
@@ -70,7 +128,7 @@ public class TikaParseTest extends CamelTestSupport {
Map<String, Object> headerMap = exchange.getIn().getHeaders();
assertThat(body, instanceOf(String.class));
assertThat((String) body, containsString("<body/>"));
- assertThat(headerMap.get("TikaContent-Type"), equalTo("image/gif"));
+ assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("image/gif"));
return true;
}
});
@@ -91,7 +149,7 @@ public class TikaParseTest extends CamelTestSupport {
Map<String, Object> headerMap = exchange.getIn().getHeaders();
assertThat(body, instanceOf(String.class));
assertThat((String) body, containsString("<body/>"));
- assertThat(headerMap.get("TikaContent-Type"), equalTo("application/msword"));
+ assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword"));
return true;
}
});
@@ -112,7 +170,7 @@ public class TikaParseTest extends CamelTestSupport {
Map<String, Object> headerMap = exchange.getIn().getHeaders();
assertThat(body, instanceOf(String.class));
assertThat((String) body, containsString("<body/>"));
- assertThat(headerMap.get("TikaContent-Type"), equalTo("application/msword"));
+ assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword"));
return true;
}
});
@@ -128,6 +186,7 @@ public class TikaParseTest extends CamelTestSupport {
from("direct:start2").to("tika:parse?tikaConfigUri=src/test/resources/tika-empty.xml")
.to("mock:result");
from("direct:start3").to("tika:parse?tikaConfig=#testConfig").to("mock:result");
+ from("direct:start4").to("tika:parse?tikaParseOutputEncoding=" + StandardCharsets.UTF_16.name()).to("mock:result");
}
};
}
http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/test/resources/testOpenOffice2.odt
----------------------------------------------------------------------
diff --git a/components/camel-tika/src/test/resources/testOpenOffice2.odt b/components/camel-tika/src/test/resources/testOpenOffice2.odt
new file mode 100644
index 0000000..0b1bb11
Binary files /dev/null and b/components/camel-tika/src/test/resources/testOpenOffice2.odt differ