You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2013/05/27 19:05:02 UTC
svn commit: r1486665 - in /tika/trunk: CHANGES.txt
tika-server/src/main/java/org/apache/tika/server/TikaResource.java
tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
Author: mattmann
Date: Mon May 27 17:05:01 2013
New Revision: 1486665
URL: http://svn.apache.org/r1486665
Log:
Patch for TIKA-1127 provided by Ali Mosavian.
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1486665&r1=1486664&r2=1486665&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon May 27 17:05:01 2013
@@ -1,5 +1,8 @@
Release 1.4 Current Development
+ * Improvements to tika-server to allow it to produce text/html and
+ text/xml content (TIKA-1126, TIKA-1127).
+
* Improvements were made to the Compressor Parser to handle g'zipped files
that require the decompressConcatenated option set to true (TIKA-1096).
Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java?rev=1486665&r1=1486664&r2=1486665&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java Mon May 27 17:05:01 2013
@@ -208,75 +208,145 @@ public class TikaResource {
}
- @PUT
- @Consumes("*/*")
- @Produces("text/html")
- public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
- final AutoDetectParser parser = createParser();
- final Metadata metadata = new Metadata();
-
- fillMetadata(parser, metadata, httpHeaders);
-
- logRequest(logger, info, metadata);
-
- return new StreamingOutput() {
- public void write(OutputStream outputStream)
- throws IOException, WebApplicationException {
- Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
- ContentHandler content;
-
- try {
- SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
- TransformerHandler handler = factory.newTransformerHandler( );
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
- handler.setResult(new StreamResult(writer));
- content = new ExpandedTitleContentHandler( handler );
- }
- catch ( TransformerConfigurationException e ) {
- throw new WebApplicationException( e );
- }
-
- TikaInputStream tis = TikaInputStream.get(is);
-
- try {
- tis.getFile();
- parser.parse(tis, content, metadata);
- }
- catch (SAXException e) {
- throw new WebApplicationException(e);
- }
- catch (EncryptedDocumentException e) {
- logger.warn(String.format(
- "%s: Encrypted document",
- info.getPath()
- ), e);
- throw new WebApplicationException(e, Response.status(422).build());
- }
- catch (TikaException e) {
- logger.warn(String.format(
- "%s: Text extraction failed",
- info.getPath()
- ), e);
-
- if (e.getCause()!=null && e.getCause() instanceof WebApplicationException)
- throw (WebApplicationException) e.getCause();
-
- if (e.getCause()!=null && e.getCause() instanceof IllegalStateException)
- throw new WebApplicationException(Response.status(422).build());
-
- if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException)
- throw new WebApplicationException(Response.status(422).build());
-
- throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
- }
- finally {
- tis.close();
- }
- }
- };
- }
+ @PUT
+ @Consumes("*/*")
+ @Produces("text/html")
+ public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+ final AutoDetectParser parser = createParser();
+ final Metadata metadata = new Metadata();
+
+ fillMetadata(parser, metadata, httpHeaders);
+
+ logRequest(logger, info, metadata);
+
+ return new StreamingOutput() {
+ public void write(OutputStream outputStream)
+ throws IOException, WebApplicationException {
+ Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
+ ContentHandler content;
+
+ try {
+ SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
+ TransformerHandler handler = factory.newTransformerHandler( );
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+ handler.setResult(new StreamResult(writer));
+ content = new ExpandedTitleContentHandler( handler );
+ }
+ catch ( TransformerConfigurationException e ) {
+ throw new WebApplicationException( e );
+ }
+
+ TikaInputStream tis = TikaInputStream.get(is);
+
+ try {
+ tis.getFile();
+ parser.parse(tis, content, metadata);
+ }
+ catch (SAXException e) {
+ throw new WebApplicationException(e);
+ }
+ catch (EncryptedDocumentException e) {
+ logger.warn(String.format(
+ "%s: Encrypted document",
+ info.getPath()
+ ), e);
+ throw new WebApplicationException(e, Response.status(422).build());
+ }
+ catch (TikaException e) {
+ logger.warn(String.format(
+ "%s: Text extraction failed",
+ info.getPath()
+ ), e);
+
+ if (e.getCause()!=null && e.getCause() instanceof WebApplicationException)
+ throw (WebApplicationException) e.getCause();
+
+ if (e.getCause()!=null && e.getCause() instanceof IllegalStateException)
+ throw new WebApplicationException(Response.status(422).build());
+
+ if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException)
+ throw new WebApplicationException(Response.status(422).build());
+
+ throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
+ }
+ finally {
+ tis.close();
+ }
+ }
+ };
+ }
+
+ @PUT
+ @Consumes("*/*")
+ @Produces("text/xml")
+ public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+ final AutoDetectParser parser = createParser();
+ final Metadata metadata = new Metadata();
+
+ fillMetadata(parser, metadata, httpHeaders);
+
+ logRequest(logger, info, metadata);
+
+ return new StreamingOutput() {
+ public void write(OutputStream outputStream)
+ throws IOException, WebApplicationException {
+ Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
+ ContentHandler content;
+
+ try {
+ SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
+ TransformerHandler handler = factory.newTransformerHandler( );
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+ handler.setResult(new StreamResult(writer));
+ content = new ExpandedTitleContentHandler( handler );
+ }
+ catch ( TransformerConfigurationException e ) {
+ throw new WebApplicationException( e );
+ }
+
+ TikaInputStream tis = TikaInputStream.get(is);
+
+ try {
+ tis.getFile();
+ parser.parse(tis, content, metadata);
+ }
+ catch (SAXException e) {
+ throw new WebApplicationException(e);
+ }
+ catch (EncryptedDocumentException e) {
+ logger.warn(String.format(
+ "%s: Encrypted document",
+ info.getPath()
+ ), e);
+ throw new WebApplicationException(e, Response.status(422).build());
+ }
+ catch (TikaException e) {
+ logger.warn(String.format(
+ "%s: Text extraction failed",
+ info.getPath()
+ ), e);
+
+ if (e.getCause()!=null && e.getCause() instanceof WebApplicationException)
+ throw (WebApplicationException) e.getCause();
+
+ if (e.getCause()!=null && e.getCause() instanceof IllegalStateException)
+ throw new WebApplicationException(Response.status(422).build());
+
+ if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException)
+ throw new WebApplicationException(Response.status(422).build());
+
+ throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
+ }
+ finally {
+ tis.close();
+ }
+ }
+ };
+ }
public static void logRequest(Log logger, UriInfo info, Metadata metadata) {
if (metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE)==null) {
Modified: tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java?rev=1486665&r1=1486664&r2=1486665&view=diff
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java (original)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java Mon May 27 17:05:01 2013
@@ -110,24 +110,45 @@ public class TikaResourceTest extends CX
assertEquals(UNPROCESSEABLE, response.getStatus());
}
- @Test
- public void testSimpleWordHTML() throws Exception {
- Response response = WebClient.create(endPoint + TIKA_PATH)
- .type("application/msword")
- .accept("text/html")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
- String responseMsg = getStringFromInputStream((InputStream) response
- .getEntity());
- assertTrue(responseMsg.contains("test"));
- }
-
- @Test
- public void testPasswordXLSHTML() throws Exception {
- Response response = WebClient.create(endPoint + TIKA_PATH)
- .type("application/vnd.ms-excel")
- .accept("text/html")
- .put(ClassLoader.getSystemResourceAsStream("password.xls"));
+ @Test
+ public void testSimpleWordHTML() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/msword")
+ .accept("text/html")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("test"));
+ }
- assertEquals(UNPROCESSEABLE, response.getStatus());
- }
+ @Test
+ public void testPasswordXLSHTML() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/vnd.ms-excel")
+ .accept("text/html")
+ .put(ClassLoader.getSystemResourceAsStream("password.xls"));
+
+ assertEquals(UNPROCESSEABLE, response.getStatus());
+ }
+
+ @Test
+ public void testSimpleWordXML() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/msword")
+ .accept("text/xml")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("test"));
+ }
+
+ @Test
+ public void testPasswordXLSXML() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/vnd.ms-excel")
+ .accept("text/xml")
+ .put(ClassLoader.getSystemResourceAsStream("password.xls"));
+
+ assertEquals(UNPROCESSEABLE, response.getStatus());
+ }
}