You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2011/02/24 10:39:19 UTC
svn commit: r1074088 - in /tika/trunk: ./ tika-server/ tika-server/src/
tika-server/src/main/ tika-server/src/main/java/
tika-server/src/main/java/org/ tika-server/src/main/java/org/apache/
tika-server/src/main/java/org/apache/tika/ tika-server/src/mai...
Author: maxcom
Date: Thu Feb 24 09:39:17 2011
New Revision: 1074088
URL: http://svn.apache.org/viewvc?rev=1074088&view=rev
Log:
TIKA-593: JAX-RS network server
Added:
tika/trunk/tika-server/
tika/trunk/tika-server/README
tika/trunk/tika-server/pom.xml
tika/trunk/tika-server/src/
tika/trunk/tika-server/src/main/
tika/trunk/tika-server/src/main/java/
tika/trunk/tika-server/src/main/java/org/
tika/trunk/tika-server/src/main/java/org/apache/
tika/trunk/tika-server/src/main/java/org/apache/tika/
tika/trunk/tika-server/src/main/java/org/apache/tika/server/
tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/PartExtractor.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaExceptionMapper.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipOutput.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipUtils.java
tika/trunk/tika-server/src/main/resources/
tika/trunk/tika-server/src/main/resources/commons-logging.properties
tika/trunk/tika-server/src/test/
tika/trunk/tika-server/src/test/java/
tika/trunk/tika-server/src/test/java/org/
tika/trunk/tika-server/src/test/java/org/apache/
tika/trunk/tika-server/src/test/java/org/apache/tika/
tika/trunk/tika-server/src/test/java/org/apache/tika/server/
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
tika/trunk/tika-server/src/test/resources/
tika/trunk/tika-server/src/test/resources/2exe.docx (with props)
tika/trunk/tika-server/src/test/resources/2pic.doc (with props)
tika/trunk/tika-server/src/test/resources/2pic.docx (with props)
tika/trunk/tika-server/src/test/resources/Doc1_ole.doc (with props)
tika/trunk/tika-server/src/test/resources/password.xls (with props)
tika/trunk/tika-server/src/test/resources/pic.xls (with props)
tika/trunk/tika-server/src/test/resources/pic.xlsx (with props)
tika/trunk/tika-server/src/test/resources/test.doc (with props)
Modified:
tika/trunk/NOTICE.txt
tika/trunk/pom.xml
Modified: tika/trunk/NOTICE.txt
URL: http://svn.apache.org/viewvc/tika/trunk/NOTICE.txt?rev=1074088&r1=1074087&r2=1074088&view=diff
==============================================================================
--- tika/trunk/NOTICE.txt (original)
+++ tika/trunk/NOTICE.txt Thu Feb 24 09:39:17 2011
@@ -7,3 +7,8 @@ The Apache Software Foundation (http://w
Copyright 1993-2010 University Corporation for Atmospheric Research/Unidata
This software contains code derived from UCAR/Unidata's NetCDF library.
+Tika-server component uses CDDL-licensed dependencies: jersey (http://jersey.java.net/) and
+Grizzly (http://grizzly.java.net/)
+
+OpenCSV: Copyright 2005 Bytecode Pty Ltd. Licensed under the Apache License, Version 2.0
+
Modified: tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/pom.xml?rev=1074088&r1=1074087&r2=1074088&view=diff
==============================================================================
--- tika/trunk/pom.xml (original)
+++ tika/trunk/pom.xml Thu Feb 24 09:39:17 2011
@@ -50,6 +50,7 @@
<module>tika-parsers</module>
<module>tika-app</module>
<module>tika-bundle</module>
+ <module>tika-server</module>
</modules>
<build>
Added: tika/trunk/tika-server/README
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/README?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/README (added)
+++ tika/trunk/tika-server/README Thu Feb 24 09:39:17 2011
@@ -0,0 +1,35 @@
+This is the JAX-RS network server for Apache Tika
+(https://issues.apache.org/jira/browse/TIKA-593)
+
+Running
+-------
+java -jar target/tika-server-1.0-SNAPSHOT.jar
+
+Usage
+-----
+Usage examples from command line with curl utility:
+
+1) Extract plain text:
+
+curl -T price.xls http://localhost:9998/tika
+
+2) Extract text with mime-type hint:
+
+curl -v -H "Content-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document" -T document.docx http://localhost:9998/tika
+
+3) Get all document attachments as ZIP-file:
+
+curl -v -T Doc1_ole.doc http://localhost:9998/unpacker > /var/tmp/x.zip
+
+4) Extract metadata to CSV format:
+
+curl -T price.xls http://localhost:9998/meta
+
+HTTP Codes
+----------
+200 - Ok
+204 - No content (for example when we are unpacking file without attachments)
+415 - Unknown file type
+422 - Unparsable document of known type (password protected documents and unsupported versions like Biff5 Excel)
+500 - Internal error
+
Added: tika/trunk/tika-server/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/pom.xml?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/pom.xml (added)
+++ tika/trunk/tika-server/pom.xml Thu Feb 24 09:39:17 2011
@@ -0,0 +1,183 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>tika-server</artifactId>
+ <packaging>bundle</packaging>
+ <version>1.0-SNAPSHOT</version>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-server</artifactId>
+ <version>1.0.3.1</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-core</artifactId>
+ <version>1.0.3.1</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-client</artifactId>
+ <version>1.0.3.1</version>
+ </dependency>
+ <dependency>
+ <groupId>javax.ws.rs</groupId>
+ <artifactId>jsr311-api</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.jersey.test.framework</groupId>
+ <artifactId>jersey-test-framework</artifactId>
+ <version>1.0.3.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.grizzly</groupId>
+ <artifactId>grizzly-servlet-webserver</artifactId>
+ <version>1.9.8</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>1.1.1</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>1.3</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-collections</groupId>
+ <artifactId>commons-collections</artifactId>
+ <version>3.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>1.2</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.5</version>
+ </dependency>
+ <dependency>
+ <groupId>net.sf.opencsv</groupId>
+ <artifactId>opencsv</artifactId>
+ <version>2.0</version>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ <version>4.8</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <!-- Maven Exec Plug-In: http://mojo.codehaus.org/exec-maven-plugin/ -->
+ <plugin>
+   <groupId>org.codehaus.mojo</groupId>
+   <artifactId>exec-maven-plugin</artifactId>
+   <version>1.1</version>
+   <executions>
+     <execution>
+       <goals>
+         <goal>java</goal>
+       </goals>
+     </execution>
+   </executions>
+   <configuration>
+     <!-- Must name the server entry point of this module; it is the same class
+          that the bundle plugin below declares as Main-Class. -->
+     <mainClass>org.apache.tika.server.TikaServerCli</mainClass>
+   </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <inherited>true</inherited>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Export-Package>org.apache.tika.*</Export-Package>
+ <Embed-Dependency>
+ !jersey-server;scope=compile;inline=META-INF/services/**|au/**|javax/**|org/**|com/**|Resources/**|font_metrics.properties|repackage/**|schema*/**,
+ jersey-server;scope=compile;inline=com/** |META-INF/services/com.sun*|META-INF/services/javax.ws.rs.ext.RuntimeDelegate
+ </Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive>
+ <Bundle-DocURL>${project.url}</Bundle-DocURL>
+ <Main-Class>org.apache.tika.server.TikaServerCli</Main-Class>
+ </instructions>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <version>2.6</version>
+ <redirectTestOutputToFile>true</redirectTestOutputToFile>
+ <argLine>-da -XX:+HeapDumpOnOutOfMemoryError -Xmx512m</argLine>
+<!-- <argLine>-agentlib:jprofilerti=port=8849 -Xbootclasspath/a:/arc/opt/jprofiler5/bin/agent.jar</argLine> -->
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <repositories>
+ <repository>
+ <id>maven2-repository.dev.java.net</id>
+ <name>Java.net Repository for Maven</name>
+ <url>http://download.java.net/maven/2/</url>
+ <layout>default</layout>
+ </repository>
+ <repository>
+ <id>maven-repository.dev.java.net</id>
+ <name>Java.net Maven 1 Repository (legacy)</name>
+ <url>http://download.java.net/maven/1</url>
+ <layout>legacy</layout>
+ </repository>
+ </repositories>
+</project>
+
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import au.com.bytecode.opencsv.CSVWriter;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.StreamingOutput;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * JAX-RS resource mounted at <code>/meta</code>.
+ *
+ * A PUT request body is parsed with an {@link AutoDetectParser} and the
+ * collected metadata is returned as CSV: one row per metadata name, the name
+ * in the first column followed by all of its values.
+ */
+@Path("/meta")
+public class MetadataResource {
+    private static final String CONTENT_LENGTH = "Content-Length";
+    private static final String FILE_NAME = "File-Name";
+    private static final String RESOURCE_NAME = "resourceName";
+
+    /**
+     * Encoding of the CSV response. It is fixed so that the output does not
+     * depend on the default charset of the JVM running the server.
+     */
+    private static final String CSV_ENCODING = "UTF-8";
+
+    /**
+     * Parses the request body and returns its metadata as a CSV stream.
+     *
+     * The document is parsed eagerly, before the response is produced, so
+     * parser failures propagate out of this method rather than out of the
+     * returned {@link StreamingOutput}.
+     *
+     * @param is          request body containing the document
+     * @param httpHeaders request headers; Content-Type selects the detector,
+     *                    Content-Length and File-Name are copied into the metadata
+     * @return a stream writer emitting one CSV row per metadata name
+     * @throws Exception if the detector cannot be created or parsing fails
+     */
+    @PUT
+    @Produces("text/csv")
+    public StreamingOutput getMetadata(InputStream is, @Context HttpHeaders httpHeaders) throws Exception {
+        final Detector detector = new HeaderTrustingDetectorFactory().createDetector(httpHeaders);
+        final AutoDetectParser parser = new AutoDetectParser(detector);
+        final ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
+        final Metadata metadata = new Metadata();
+        parser.parse(is, new DefaultHandler(), metadata, context);
+        // Applied after parsing, so the header values replace whatever the parser stored.
+        fillMetadata(httpHeaders, metadata);
+
+        return new StreamingOutput() {
+            @Override
+            public void write(OutputStream outputStream) throws IOException, WebApplicationException {
+                CSVWriter writer = new CSVWriter(new OutputStreamWriter(outputStream, CSV_ENCODING));
+                for (String name : metadata.names()) {
+                    String[] values = metadata.getValues(name);
+                    List<String> row = new ArrayList<String>(values.length + 1);
+                    row.add(name);
+                    row.addAll(Arrays.asList(values));
+                    writer.writeNext(row.toArray(new String[row.size()]));
+                }
+                writer.close();
+            }
+        };
+    }
+
+    /**
+     * Copies the first Content-Length and File-Name request header values, when
+     * present, into the metadata under "Content-Length" and "resourceName".
+     */
+    private void fillMetadata(HttpHeaders httpHeaders, Metadata metadata) {
+        final List<String> fileName = httpHeaders.getRequestHeader(FILE_NAME);
+        final List<String> contentLength = httpHeaders.getRequestHeader(CONTENT_LENGTH);
+
+        if (contentLength != null && !contentLength.isEmpty()) {
+            metadata.set(CONTENT_LENGTH, contentLength.get(0));
+        }
+
+        if (fileName != null && !fileName.isEmpty()) {
+            metadata.set(RESOURCE_NAME, fileName.get(0));
+        }
+    }
+
+    /**
+     * Builds the {@link Detector} for a request based on its Content-Type header.
+     */
+    private static class HeaderTrustingDetectorFactory {
+        /**
+         * @param httpHeaders request headers
+         * @return the mime repository of a fresh {@link TikaConfig} when the
+         *         request declares no media type or application/octet-stream;
+         *         otherwise a detector that always reports the declared type
+         */
+        public Detector createDetector(HttpHeaders httpHeaders) throws IOException, MimeTypeException {
+            final javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
+            if (mediaType == null || mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) {
+                return (new TikaConfig()).getMimeRepository();
+            }
+
+            return new Detector() {
+                @Override
+                public MediaType detect(InputStream inputStream, Metadata metadata) throws IOException {
+                    return MediaType.parse(mediaType.toString());
+                }
+            };
+        }
+    }
+}
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/PartExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/PartExtractor.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/PartExtractor.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/PartExtractor.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import java.io.IOException;
+import java.util.zip.ZipOutputStream;
+
+/**
+ * Callback that writes a single part into a ZIP stream.
+ *
+ * @param <T> type of the part handed to {@link #extract}
+ */
+public interface PartExtractor<T> {
+    /**
+     * Writes the given part to the ZIP output.
+     *
+     * @param part   the part to write
+     * @param output ZIP stream receiving the part
+     * @throws IOException if writing to the stream fails
+     */
+    void extract(T part, ZipOutputStream output) throws IOException;
+}
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaExceptionMapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaExceptionMapper.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaExceptionMapper.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaExceptionMapper.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import org.apache.tika.exception.TikaException;
+
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.ext.ExceptionMapper;
+import javax.ws.rs.ext.Provider;
+
+/**
+ * Translates a {@link TikaException} into an HTTP response.
+ *
+ * When the exception wraps a {@link WebApplicationException}, that exception's
+ * own response is returned; every other failure becomes a generic server error.
+ */
+@Provider
+public class TikaExceptionMapper implements ExceptionMapper<TikaException> {
+    @Override
+    public Response toResponse(TikaException e) {
+        final Throwable cause = e.getCause();
+
+        // instanceof is false for null, so a missing cause falls through here too.
+        if (!(cause instanceof WebApplicationException)) {
+            return Response.serverError().build();
+        }
+
+        return ((WebApplicationException) cause).getResponse();
+    }
+}
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.EncryptedDocumentException;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import javax.ws.rs.*;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.StreamingOutput;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Set;
+
+/**
+ * JAX-RS resource mounted at <code>/tika</code>.
+ *
+ * GET returns a short greeting; PUT parses the request body and streams the
+ * extracted body text back as plain text.
+ */
+@Path("/tika")
+public class TikaResource {
+    public static final String GREETING = "This is Tika Server. Please PUT\n";
+    private final Log logger = LogFactory.getLog(TikaResource.class);
+
+    /** Returns the static greeting text. */
+    @SuppressWarnings({"SameReturnValue"})
+    @GET
+    @Produces("text/plain")
+    public String getMessage() {
+        return GREETING;
+    }
+
+    /**
+     * Creates an {@link AutoDetectParser} whose fallback parser rejects the
+     * document with an "unsupported media type" {@link WebApplicationException}.
+     *
+     * @return the configured parser
+     */
+    public static AutoDetectParser createParser() {
+        final AutoDetectParser autoDetect = new AutoDetectParser();
+
+        final Parser rejecting = new Parser() {
+            public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+                return autoDetect.getSupportedTypes(parseContext);
+            }
+
+            @Override
+            public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata) {
+                throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
+            }
+
+            @Override
+            public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
+                throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
+            }
+        };
+
+        autoDetect.setFallback(rejecting);
+        return autoDetect;
+    }
+
+    /**
+     * Applies the request's Content-Type to the parse.
+     *
+     * Nothing happens when the request declares no media type or declares
+     * application/octet-stream. Otherwise the declared type is added to the
+     * metadata and the parser's detector is wrapped so that a Content-Type
+     * found in the metadata wins over the previously installed detector.
+     *
+     * @param parser      parser whose detector may be replaced
+     * @param metadata    metadata receiving the Content-Type entry
+     * @param httpHeaders request headers
+     */
+    public static void fillMetadata(AutoDetectParser parser, Metadata metadata, HttpHeaders httpHeaders) {
+        final javax.ws.rs.core.MediaType declared = httpHeaders.getMediaType();
+
+        if (declared == null || declared.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) {
+            return;
+        }
+
+        metadata.add(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE, declared.toString());
+
+        final Detector previous = parser.getDetector();
+
+        parser.setDetector(new Detector() {
+            @Override
+            public MediaType detect(InputStream inputStream, Metadata metadata) throws IOException {
+                final String contentType = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
+
+                return contentType != null
+                        ? MediaType.parse(contentType)
+                        : previous.detect(inputStream, metadata);
+            }
+        });
+    }
+
+    /**
+     * Extracts the body text of the uploaded document.
+     *
+     * Parsing happens lazily, while the response is being written. A
+     * {@link TikaException} is translated as follows: a wrapped
+     * {@link WebApplicationException} is rethrown as is; a cause of type
+     * IllegalStateException, EncryptedDocumentException or
+     * OldWordFileFormatException yields status 422; anything else is logged
+     * and reported as an internal server error.
+     *
+     * @param is          request body containing the document
+     * @param httpHeaders request headers
+     * @return a stream writer producing the extracted text
+     */
+    @PUT
+    @Consumes("*/*")
+    @Produces("text/plain")
+    public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders) {
+        final Metadata metadata = new Metadata();
+        final AutoDetectParser parser = createParser();
+
+        fillMetadata(parser, metadata, httpHeaders);
+
+        return new StreamingOutput() {
+            @Override
+            public void write(OutputStream outputStream) throws IOException, WebApplicationException {
+                final BodyContentHandler handler = new BodyContentHandler(outputStream);
+
+                try {
+                    parser.parse(is, handler, metadata);
+                } catch (SAXException e) {
+                    throw new WebApplicationException(e);
+                } catch (TikaException e) {
+                    final Throwable cause = e.getCause();
+
+                    if (cause instanceof WebApplicationException) {
+                        throw (WebApplicationException) cause;
+                    }
+
+                    final boolean unparsable = cause instanceof IllegalStateException
+                            || cause instanceof EncryptedDocumentException
+                            || cause instanceof OldWordFileFormatException;
+
+                    if (unparsable) {
+                        throw new WebApplicationException(Response.status(422).build());
+                    }
+
+                    logger.warn("Text extraction failed", e);
+
+                    throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
+                }
+            }
+        };
+    }
+}
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import com.sun.grizzly.http.SelectorThread;
+import com.sun.jersey.api.container.grizzly.GrizzlyWebContainerFactory;
+import org.apache.commons.cli.*;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import javax.ws.rs.core.UriBuilder;
+import java.net.URI;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Command-line entry point of the Tika server.
+ *
+ * Starts a Grizzly web container on <code>http://localhost:&lt;port&gt;/</code>
+ * that serves the JAX-RS resources found in package
+ * <code>org.apache.tika.server</code>.
+ */
+public class TikaServerCli {
+    private static final Log logger = LogFactory.getLog(TikaServerCli.class);
+
+    public static final int DEFAULT_PORT = 9998;
+
+    /** Builds the supported command-line options: <code>-p/--port</code> and <code>-h/--help</code>. */
+    private static Options getOptions() {
+        Options options = new Options();
+        options.addOption("p", "port", true, "listen port (default = "+DEFAULT_PORT+ ')');
+
+        options.addOption("h", "help", false, "this help message");
+
+        return options;
+    }
+
+    /**
+     * Parses the arguments and starts the server.
+     *
+     * With <code>--help</code> the usage text is printed and the JVM exits with
+     * status -1. Any failure while parsing arguments or starting the container
+     * is logged as fatal and also ends the JVM with status -1.
+     *
+     * @param args command-line arguments
+     */
+    public static void main(String[] args) {
+        try {
+            Options options = getOptions();
+
+            CommandLineParser cliParser = new GnuParser();
+            CommandLine line = cliParser.parse(options, args);
+
+            // Handle --help before looking at any other option, so that an
+            // unusable --port value cannot prevent the usage text from showing.
+            if (line.hasOption("help")) {
+                HelpFormatter helpFormatter = new HelpFormatter();
+                helpFormatter.printHelp("tikaserver", options);
+                System.exit(-1);
+            }
+
+            int port = DEFAULT_PORT;
+
+            if (line.hasOption("port")) {
+                port = Integer.parseInt(line.getOptionValue("port"));
+            }
+
+            Map<String, String> params = new HashMap<String, String>();
+
+            // Tells Jersey which package to scan for resource and provider classes.
+            params.put("com.sun.jersey.config.property.packages", "org.apache.tika.server");
+
+            String baseUri = "http://localhost/";
+            URI buildUri = UriBuilder.fromUri(baseUri).port(port).build();
+            GrizzlyWebContainerFactory.create(buildUri, params);
+
+            logger.info("Started at " + buildUri);
+        } catch (Exception ex) {
+            logger.fatal("Can't start", ex);
+            System.exit(-1);
+        }
+    }
+}
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.StreamingOutput;
+import java.io.*;
+import java.util.Collections;
+import java.util.zip.ZipOutputStream;
+
+@Path("/unpacker")
+public class UnpackerResource {
+ private static final Log logger = LogFactory.getLog(UnpackerResource.class);
+
+ private final TikaConfig tikaConfig;
+
+ public UnpackerResource() {
+ tikaConfig = TikaConfig.getDefaultConfig();
+ }
+
+ @PUT
+ @Produces("application/zip")
+ public StreamingOutput getText(
+ InputStream is,
+ @Context HttpHeaders httpHeaders
+ ) throws Exception {
+ if (!is.markSupported()) {
+ is = new BufferedInputStream(is);
+ }
+
+ Parser parser;
+
+ javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
+ if (mediaType !=null && !mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) {
+ parser = tikaConfig.getParser(new MediaType(httpHeaders.getMediaType().getType(), httpHeaders.getMediaType().getSubtype()));
+ } else {
+ MediaType type = tikaConfig.getMimeRepository().detect(is, new Metadata());
+ parser = tikaConfig.getParser(type);
+ }
+
+ if (parser==null) {
+ throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
+ }
+
+ ContentHandler ch = new DefaultHandler();
+
+ ParseContext pc = new ParseContext();
+
+ ZipOutput zout = new ZipOutput();
+ MutableInt count = new MutableInt();
+
+ pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, zout));
+
+ parser.parse(is, ch, new Metadata(), pc);
+
+ if (count.intValue()==0) {
+ throw new WebApplicationException(Response.Status.NO_CONTENT);
+ }
+
+ return zout;
+ }
+
+ private class MyEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
+ private final MutableInt count;
+ private final ZipOutput zout;
+
+ MyEmbeddedDocumentExtractor(MutableInt count, ZipOutput zout) {
+ this.count = count;
+ this.zout = zout;
+ }
+
+ @Override
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ return true;
+ }
+
+ @Override
+ public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean b) throws SAXException, IOException {
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ IOUtils.copy(inputStream, bos);
+ byte[] data = bos.toByteArray();
+
+ String name = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
+ String contentType = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
+
+ if (name == null) {
+ name = Integer.toString(count.intValue());
+ }
+
+ if (!name.contains(".")) {
+ try {
+ String ext = tikaConfig.getMimeRepository().forName(contentType).getExtension();
+
+ if (ext!=null) {
+ name += ext;
+ }
+ } catch (MimeTypeException e) {
+ logger.warn("Unexpected MimeTypeException", e);
+ }
+ }
+
+ if ("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) {
+ POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(data));
+ OfficeParser.POIFSDocumentType type = OfficeParser.POIFSDocumentType.detectType(poifs);
+
+ if (type == OfficeParser.POIFSDocumentType.OLE10_NATIVE) {
+ try {
+ Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(poifs);
+ if (ole.getDataSize()>0) {
+ String label = ole.getLabel();
+
+ if (label.startsWith("ole-")) {
+ label = Integer.toString(count.intValue()) + '-' + label;
+ }
+
+ name = label;
+
+ data = ole.getDataBuffer();
+ }
+ } catch (Ole10NativeException ex) {
+ logger.warn("Skipping invalid part", ex);
+ }
+ } else {
+ name += '.' + type.getExtension();
+ }
+ }
+
+ final String finalName = name;
+
+ zout.put(new PartExtractor<byte[]>() {
+ @Override
+ public void extract(byte[] part, ZipOutputStream output) throws IOException {
+ ZipUtils.zipStoreBuffer(output, finalName, part);
+ }
+ }, Collections.singletonList(data));
+
+ count.increment();
+ }
+ }
+}
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipOutput.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipOutput.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipOutput.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipOutput.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.StreamingOutput;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.zip.ZipOutputStream;
+
+public class ZipOutput implements StreamingOutput {
+ private final Map<PartExtractor, Collection> parts = new HashMap<PartExtractor, Collection>();
+
+ public <T> void put(PartExtractor<T> extractor, Collection<T> parts) {
+ if (parts.isEmpty()) {
+ return;
+ }
+
+ this.parts.put(extractor, parts);
+ }
+
+ @Override
+ public void write(OutputStream outputStream) throws IOException, WebApplicationException {
+ ZipOutputStream zip = new ZipOutputStream(outputStream);
+
+ zip.setMethod(ZipOutputStream.STORED);
+
+ addParts(zip);
+
+ zip.close();
+ }
+
+ private void addParts(ZipOutputStream zip) throws IOException {
+ for (Map.Entry<PartExtractor, Collection> entry : parts.entrySet()) {
+ for (Object part : entry.getValue()) {
+ entry.getKey().extract(part, zip);
+ }
+ }
+ }
+
+ public boolean isEmpty() {
+ return parts.isEmpty();
+ }
+}
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipUtils.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipUtils.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipUtils.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipUtils.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import java.io.IOException;
+import java.util.zip.CRC32;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+import java.util.zip.ZipException;
+import java.util.UUID;
+
+ /**
+  * Static helpers for writing uncompressed (STORED) ZIP entries and for
+  * normalising entry names.
+  */
+ public class ZipUtils {
+     private ZipUtils() {
+     }
+
+     /**
+      * Writes {@code dataBuffer} to {@code zip} as a single STORED entry.
+      *
+      * <p>STORED entries need their size and CRC-32 set before the entry is
+      * opened, so both are computed from the buffer up front. If the stream
+      * rejects the entry with a {@link ZipException} (for example because the
+      * name is already present in the archive), the entry is retried once
+      * under a random UUID name. A failure of that fallback is propagated.</p>
+      *
+      * @param zip        target archive stream
+      * @param name       entry name, or {@code null} to use a random UUID
+      * @param dataBuffer entry content
+      * @throws IOException if the entry cannot be written
+      */
+     public static void zipStoreBuffer(ZipOutputStream zip, String name, byte[] dataBuffer) throws IOException {
+         String entryName = name != null ? name : UUID.randomUUID().toString();
+
+         ZipEntry zipEntry = new ZipEntry(entryName);
+         zipEntry.setMethod(ZipOutputStream.STORED);
+         zipEntry.setSize(dataBuffer.length);
+
+         CRC32 crc32 = new CRC32();
+         crc32.update(dataBuffer);
+         zipEntry.setCrc(crc32.getValue());
+
+         try {
+             zip.putNextEntry(zipEntry);
+         } catch (ZipException ex) {
+             if (name == null) {
+                 // The generated name was rejected as well: report the real cause
+                 // instead of failing later on write() with no open entry.
+                 throw ex;
+             }
+
+             zipStoreBuffer(zip, null, dataBuffer);
+             return;
+         }
+
+         zip.write(dataBuffer);
+
+         zip.closeEntry();
+     }
+
+     /**
+      * Strips a single leading {@code '/'} from {@code name}.
+      *
+      * @param name entry name; must not be {@code null}, may be empty
+      * @return {@code name} without its leading slash, or {@code name} unchanged
+      */
+     public static String cleanupFilename(String name) {
+         // startsWith is safe for the empty string, unlike charAt(0).
+         if (name.startsWith("/")) {
+             return name.substring(1);
+         }
+
+         return name;
+     }
+ }
Added: tika/trunk/tika-server/src/main/resources/commons-logging.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/resources/commons-logging.properties?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/resources/commons-logging.properties (added)
+++ tika/trunk/tika-server/src/main/resources/commons-logging.properties Thu Feb 24 09:39:17 2011
@@ -0,0 +1 @@
+org.apache.commons.logging.Log=org.apache.commons.logging.impl.Jdk14Logger
Added: tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java (added)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import au.com.bytecode.opencsv.CSVReader;
+import com.sun.jersey.test.framework.JerseyTest;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import org.junit.Test;
+
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+
+ /**
+  * Integration tests for the {@code /meta} resource, run through the Jersey
+  * test framework against the {@code org.apache.tika.server} package.
+  */
+ public class MetadataResourceTest extends JerseyTest {
+     private static final String META_PATH = "/meta";
+
+     public MetadataResourceTest() throws Exception {
+         super("org.apache.tika.server");
+     }
+
+     /**
+      * PUTs a Word document to the metadata resource, reads the response as
+      * two-column CSV (name, value) and checks the reported author.
+      */
+     @Test
+     public void testSimpleWord() throws Exception {
+         Reader reader =
+                 webResource.path(META_PATH)
+                         .type("application/msword")
+                         .put(Reader.class, ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+
+         Map<String, String> metadata = new HashMap<String, String>();
+
+         CSVReader csvReader = new CSVReader(reader);
+         try {
+             String[] nextLine;
+             while ((nextLine = csvReader.readNext()) != null) {
+                 metadata.put(nextLine[0], nextLine[1]);
+             }
+         } finally {
+             // Closing the CSV reader also releases the underlying response reader.
+             csvReader.close();
+         }
+
+         assertNotNull("Author field missing from metadata response", metadata.get("Author"));
+         assertEquals("Maxim Valyanskiy", metadata.get("Author"));
+     }
+ /*
+     NOTE(review): this test was committed disabled; the reason is not recorded here.
+
+     @Test
+     public void testXLSX() throws Exception {
+         Reader reader =
+                 webResource.path(META_PATH)
+                         .type("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+                         .header("File-Name", TikaResourceTest.TEST_XLSX)
+                         .put(Reader.class, ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_XLSX));
+
+         CSVReader csvReader = new CSVReader(reader);
+
+         final Map < String, String > metadataActual = new HashMap < String, String > (),
+                 metadataExpected = new HashMap < String, String > ();
+
+         String[] nextLine;
+         while ((nextLine = csvReader.readNext()) != null) {
+             metadataActual.put(nextLine[0], nextLine[1]);
+         }
+         metadataExpected.put("Author", "jet");
+         metadataExpected.put("Application-Name", "Microsoft Excel");
+         metadataExpected.put("description", "Тестовый комментарий");
+         metadataExpected.put("resourceName", TikaResourceTest.TEST_XLSX);
+         metadataExpected.put("protected", "false");
+         metadataExpected.put("Creation-Date", "2010-05-11T12:37:42Z");
+         metadataExpected.put("Last-Modified", "2010-05-11T14:46:20Z");
+         assertEquals( true, metadataActual.size() >= metadataExpected.size() );
+         for ( final Map.Entry < String, String > field : metadataExpected.entrySet() ) {
+             final String key = field.getKey(), valueActual = metadataActual.get(key), valueExpected = field.getValue();
+             assertNotNull( valueActual );
+             assertEquals( valueExpected, valueActual );
+         }
+     }
+ */
+ }
Added: tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java (added)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import com.sun.jersey.api.client.ClientResponse;
+import com.sun.jersey.core.header.MediaTypes;
+import com.sun.jersey.test.framework.JerseyTest;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import org.junit.Test;
+
+ /**
+  * Integration tests for the {@code tika} resource, run through the Jersey
+  * test framework against the {@code org.apache.tika.server} package.
+  */
+ public class TikaResourceTest extends JerseyTest {
+     /** Name of a Word test document on the classpath; shared with other tests. */
+     public static final String TEST_DOC = "test.doc";
+     /** Name of an Excel (OOXML) test document on the classpath; shared with other tests. */
+     public static final String TEST_XLSX = "16637.xlsx";
+
+     private static final String TIKA_PATH = "tika";
+     /** HTTP status expected when a document cannot be processed. */
+     private static final int UNPROCESSABLE_ENTITY = 422;
+
+     public TikaResourceTest() throws Exception {
+         super("org.apache.tika.server");
+     }
+
+     /**
+      * A plain GET on the resource must answer with the greeting text.
+      */
+     @Test
+     public void testHelloWorld() {
+         String greeting = webResource.path(TIKA_PATH).get(String.class);
+
+         assertEquals(TikaResource.GREETING, greeting);
+     }
+
+     /**
+      * PUTting a Word document must return text that contains "test".
+      */
+     @Test
+     public void testSimpleWord() {
+         String extractedText = webResource
+                 .path(TIKA_PATH)
+                 .type("application/msword")
+                 .put(String.class, ClassLoader.getSystemResourceAsStream(TEST_DOC));
+
+         assertTrue(extractedText.contains("test"));
+     }
+
+     /**
+      * The generated WADL description must be served and must not be empty.
+      */
+     @Test
+     public void testApplicationWadl() {
+         String wadl = webResource
+                 .path("application.wadl")
+                 .accept(MediaTypes.WADL)
+                 .get(String.class);
+
+         assertTrue(wadl.length() > 0);
+     }
+
+     /**
+      * PUTting the {@code password.xls} test file must be answered with
+      * status 422.
+      */
+     @Test
+     public void testPasswordXLS() throws Exception {
+         ClientResponse response = webResource
+                 .path(TIKA_PATH)
+                 .type("application/vnd.ms-excel")
+                 .put(ClientResponse.class, ClassLoader.getSystemResourceAsStream("password.xls"));
+
+         assertEquals(UNPROCESSABLE_ENTITY, response.getStatus());
+     }
+ }
Added: tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java?rev=1074088&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java (added)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java Thu Feb 24 09:39:17 2011
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server;
+
+import com.sun.jersey.api.client.ClientResponse;
+import com.sun.jersey.test.framework.JerseyTest;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.tika.io.IOUtils;
+import org.junit.Test;
+
+import java.io.*;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+ /**
+  * Integration tests for the {@code /unpacker} resource, run through the
+  * Jersey test framework against the {@code org.apache.tika.server} package.
+  *
+  * <p>Each test PUTs a document, reads the ZIP archive returned by the server
+  * and compares the MD5 digests of selected entries with known values.</p>
+  */
+ public class UnpackerResourceTest extends JerseyTest {
+     private static final String UNPACKER_PATH = "/unpacker";
+
+     private static final String TEST_DOC_WAV = "Doc1_ole.doc";
+     private static final String WAV1_MD5 = "bdd0a78a54968e362445364f95d8dc96";
+     private static final String WAV1_NAME = "_1310388059/MSj00974840000[1].wav";
+     private static final String WAV2_MD5 = "3bbd42fb1ac0e46a95350285f16d9596";
+     private static final String WAV2_NAME = "_1310388058/MSj00748450000[1].wav";
+     private static final String APPLICATION_MSWORD = "application/msword";
+     private static final int NO_CONTENT = 204;
+     private static final String JPG_NAME = "image1.jpg";
+     private static final String XSL_IMAGE1_MD5 = "68ead8f4995a3555f48a2f738b2b0c3d";
+     private static final String JPG_MD5 = XSL_IMAGE1_MD5;
+     private static final String JPG2_NAME = "image2.jpg";
+     private static final String JPG2_MD5 = "b27a41d12c646d7fc4f3826cf8183c68";
+     private static final String TEST_DOCX_IMAGE = "2pic.docx";
+     private static final String DOCX_IMAGE1_MD5 = "5516590467b069fa59397432677bad4d";
+     private static final String DOCX_IMAGE2_MD5 = "a5dd81567427070ce0a2ff3e3ef13a4c";
+     private static final String DOCX_IMAGE1_NAME = "image1.jpeg";
+     private static final String DOCX_IMAGE2_NAME = "image2.jpeg";
+     private static final String DOCX_EXE1_MD5 = "d71ffa0623014df725f8fd2710de4411";
+     private static final String DOCX_EXE1_NAME = "GMapTool.exe";
+     private static final String DOCX_EXE2_MD5 = "2485435c7c22d35f2de9b4c98c0c2e1a";
+     private static final String DOCX_EXE2_NAME = "Setup.exe";
+     private static final String XSLX_IMAGE1_NAME = "image1.jpeg";
+     private static final String XSLX_IMAGE2_NAME = "image2.jpeg";
+     private static final String XSL_IMAGE2_MD5 = "8969288f4245120e7c3870287cce0ff3";
+     private static final String COVER_JPG_MD5SUM = "4d236dab6e711735ed11686641b1fba9";
+     private static final String COVER_JPG = "cover.jpg";
+     private static final String APPLICATION_XML = "application/xml";
+     private static final String CONTENT_TYPE = "Content-type";
+
+     public UnpackerResourceTest() throws Exception {
+         super("org.apache.tika.server");
+     }
+
+     /** Both WAV attachments of the OLE test document must be unpacked. */
+     @Test
+     public void testDocWAV() throws Exception {
+         Map<String, String> data = unpack(TEST_DOC_WAV, APPLICATION_MSWORD);
+
+         assertEquals(WAV1_MD5, data.get(WAV1_NAME));
+         assertEquals(WAV2_MD5, data.get(WAV2_NAME));
+     }
+
+     /** The picture of the OLE test document must be unpacked. */
+     @Test
+     public void testDocPicture() throws Exception {
+         Map<String, String> data = unpack(TEST_DOC_WAV, APPLICATION_MSWORD);
+
+         assertEquals(JPG_MD5, data.get(JPG_NAME));
+     }
+
+     /** A picture must be unpacked from the {@code 2pic.doc} test document. */
+     @Test
+     public void testDocPictureNoOle() throws Exception {
+         Map<String, String> data = unpack("2pic.doc", APPLICATION_MSWORD);
+
+         assertEquals(JPG2_MD5, data.get(JPG2_NAME));
+     }
+
+     /** Both images of a DOCX must be unpacked; no request content type is sent. */
+     @Test
+     public void testImageDOCX() throws Exception {
+         Map<String, String> data = unpack(TEST_DOCX_IMAGE, null);
+
+         assertEquals(DOCX_IMAGE1_MD5, data.get(DOCX_IMAGE1_NAME));
+         assertEquals(DOCX_IMAGE2_MD5, data.get(DOCX_IMAGE2_NAME));
+     }
+
+     /** Both executables of a DOCX must be unpacked; no request content type is sent. */
+     @Test
+     public void testExeDOCX() throws Exception {
+         Map<String, String> data = unpack("2exe.docx", null);
+
+         assertEquals(DOCX_EXE1_MD5, data.get(DOCX_EXE1_NAME));
+         assertEquals(DOCX_EXE2_MD5, data.get(DOCX_EXE2_NAME));
+     }
+ /*
+     NOTE(review): this test was committed disabled; the reason is not recorded here.
+
+     @Test
+     public void testImageXSLX() throws Exception {
+         InputStream is =
+                 webResource
+                         .path(UNPACKER_PATH)
+                         .put(InputStream.class, ClassLoader.getSystemResourceAsStream("pic.xlsx"));
+
+         ZipInputStream zip = new ZipInputStream(is);
+
+         Map<String, String> data = readZip(zip);
+
+         assertEquals(XSL_IMAGE1_MD5, data.get(XSLX_IMAGE1_NAME));
+         assertEquals(XSL_IMAGE2_MD5, data.get(XSLX_IMAGE2_NAME));
+     }
+ */
+     /** Both images of an XLS must be unpacked under the names 0.jpg and 1.jpg. */
+     @Test
+     public void testImageXSL() throws Exception {
+         Map<String, String> data = unpack("pic.xls", null);
+
+         assertEquals(XSL_IMAGE1_MD5, data.get("0.jpg"));
+         assertEquals(XSL_IMAGE2_MD5, data.get("1.jpg"));
+     }
+
+     /**
+      * PUTs a classpath resource to the unpacker and returns the MD5 digest of
+      * every entry of the returned ZIP archive, keyed by entry name.
+      *
+      * @param resourceName classpath name of the document to upload
+      * @param contentType  request content type, or {@code null} to send none
+      * @return entry name to lower-case hex MD5 of the entry content
+      * @throws IOException if the response cannot be read as a ZIP archive
+      */
+     private Map<String, String> unpack(String resourceName, String contentType) throws IOException {
+         InputStream document = ClassLoader.getSystemResourceAsStream(resourceName);
+         // Fail with a clear message instead of an opaque client error.
+         assertNotNull("Test resource not found on classpath: " + resourceName, document);
+
+         try {
+             InputStream response;
+             if (contentType == null) {
+                 response = webResource
+                         .path(UNPACKER_PATH)
+                         .put(InputStream.class, document);
+             } else {
+                 response = webResource
+                         .path(UNPACKER_PATH)
+                         .type(contentType)
+                         .put(InputStream.class, document);
+             }
+
+             ZipInputStream zip = new ZipInputStream(response);
+             try {
+                 return readZip(zip);
+             } finally {
+                 // Also closes the underlying response stream.
+                 zip.close();
+             }
+         } finally {
+             document.close();
+         }
+     }
+
+     /**
+      * Reads {@code zip} to its end and returns the MD5 digest of each entry,
+      * keyed by entry name. The stream is not closed here.
+      */
+     private static Map<String, String> readZip(ZipInputStream zip) throws IOException {
+         Map<String, String> data = new HashMap<String, String>();
+
+         ZipEntry entry;
+         while ((entry = zip.getNextEntry()) != null) {
+             ByteArrayOutputStream bos = new ByteArrayOutputStream();
+
+             IOUtils.copy(zip, bos);
+
+             data.put(entry.getName(), DigestUtils.md5Hex(bos.toByteArray()));
+         }
+
+         return data;
+     }
+ }
Added: tika/trunk/tika-server/src/test/resources/2exe.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/2exe.docx?rev=1074088&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-server/src/test/resources/2exe.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-server/src/test/resources/2pic.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/2pic.doc?rev=1074088&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-server/src/test/resources/2pic.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-server/src/test/resources/2pic.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/2pic.docx?rev=1074088&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-server/src/test/resources/2pic.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-server/src/test/resources/Doc1_ole.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/Doc1_ole.doc?rev=1074088&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-server/src/test/resources/Doc1_ole.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-server/src/test/resources/password.xls
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/password.xls?rev=1074088&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-server/src/test/resources/password.xls
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-server/src/test/resources/password.xls
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-server/src/test/resources/pic.xls
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/pic.xls?rev=1074088&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-server/src/test/resources/pic.xls
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-server/src/test/resources/pic.xlsx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/pic.xlsx?rev=1074088&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-server/src/test/resources/pic.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-server/src/test/resources/test.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/test.doc?rev=1074088&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-server/src/test/resources/test.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream