You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/04 01:28:07 UTC

[tika] branch 2.x updated: TIKA 2343 -- add text-main/boilerpipe option to tika-server

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/2.x by this push:
       new  9ef0787   TIKA 2343 -- add text-main/boilerpipe option to tika-server
9ef0787 is described below

commit 9ef078778bfe2f4f0fc4cbd1897a8cd0c663b60c
Author: tballison <ta...@mitre.org>
AuthorDate: Wed May 3 21:27:57 2017 -0400

    TIKA 2343 -- add text-main/boilerpipe option to tika-server
---
 .../apache/tika/server/resource/TikaResource.java  | 49 ++++++++++++++++++++--
 .../org/apache/tika/server/TikaResourceTest.java   | 38 +++++++++++++++--
 2 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 37cff74..3423b32 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -17,8 +17,6 @@
 
 package org.apache.tika.server.resource;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 import javax.mail.internet.ContentDisposition;
 import javax.mail.internet.ParseException;
 import javax.ws.rs.Consumes;
@@ -69,17 +67,20 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.apache.tika.parser.html.HtmlParser;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
-import org.apache.tika.server.RichTextContentHandler;
 import org.apache.tika.server.InputStreamFactory;
+import org.apache.tika.server.RichTextContentHandler;
 import org.apache.tika.server.TikaServerParseException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 @Path("/tika")
 public class TikaResource {
     public static final String GREETING = "This is Tika Server ("+new Tika().toString()+"). Please PUT\n";
@@ -362,6 +363,48 @@ public class TikaResource {
         return produceText(att.getObject(InputStream.class), att.getHeaders(), info);
     }
 
+    //PUT /tika/main: equivalent to text-main (Boilerpipe handler) in tika-app; passes the raw request body and request headers to produceTextMain
+    @PUT
+    @Consumes("*/*")
+    @Produces("text/plain")
+    @Path("main")
+    public StreamingOutput getTextMain(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+        return produceTextMain(is, httpHeaders.getRequestHeaders(), info);
+    }
+
+    //PUT /tika/form/main: multipart/form-data variant of text-main (Boilerpipe handler) in tika-app; the stream and headers come from the attachment
+    @PUT
+    @Consumes("multipart/form-data")
+    @Produces("text/plain")
+    @Path("form/main")
+    public StreamingOutput getTextMainFromMultipart(final Attachment att, @Context final UriInfo info) {
+        return produceTextMain(att.getObject(InputStream.class), att.getHeaders(), info);
+    }
+
+    public StreamingOutput produceTextMain(final InputStream is, @Context MultivaluedMap<String, String> httpHeaders, @Context final UriInfo info) { //shared by getTextMain and getTextMainFromMultipart
+        final Parser parser = createParser();
+        final Metadata metadata = new Metadata();
+        final ParseContext context = new ParseContext();
+
+        fillMetadata(parser, metadata, context, httpHeaders); //metadata and parse context are populated from the request headers before the request is logged
+        fillParseContext(context, httpHeaders, parser);
+
+        logRequest(logger, info, metadata);
+
+        return new StreamingOutput() { //the parse itself is deferred until write() is called with the response stream
+            public void write(OutputStream outputStream) throws IOException, WebApplicationException {
+                Writer writer = new OutputStreamWriter(outputStream, UTF_8); //response text is encoded as UTF-8
+
+                ContentHandler handler = new BoilerpipeContentHandler(writer); //Boilerpipe handler is given the response writer as its sink
+
+                try (InputStream inputStream = is) { //try-with-resources closes the request stream once parse() returns or throws
+                    parse(parser, logger, info.getPath(), inputStream, handler, metadata, context);
+                }
+            }
+        };
+    }
+
+
     @PUT
     @Consumes("*/*")
     @Produces("text/plain")
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 4581157..f414777 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -17,9 +17,6 @@
 
 package org.apache.tika.server;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
 import javax.ws.rs.core.Response;
 import java.io.InputStream;
 import java.util.ArrayList;
@@ -32,11 +29,15 @@ import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.server.resource.TikaResource;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
 import org.junit.Test;
 
 public class TikaResourceTest extends CXFTestBase {
     public static final String TEST_DOC = "test.doc";
-    public static final String TEST_XLSX = "16637.xlsx";
     public static final String TEST_PASSWORD_PROTECTED = "password.xls";
     private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
 
@@ -77,6 +78,35 @@ public class TikaResourceTest extends CXFTestBase {
     }
 
     @Test
+    public void testTextMain() throws Exception { //PUTs testHTML.html to the /main endpoint and checks the plain-text response
+        Response response = WebClient.create(endPoint + TIKA_PATH + "/main")
+                .accept("text/plain")
+                .put(getTestDocumentAsStream("testHTML.html"));
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        assertTrue(responseMsg.contains("Title : Test Indexation Html")); //this text must be present in the response
+        assertFalse(responseMsg.contains("Indexation du fichier")); //this text must be absent from the response
+    }
+
+    @Test
+    public void testTextMainMultipart() throws Exception { //same checks as the /main test, but sent as a multipart attachment to /form/main
+
+        Attachment attachmentPart =
+                new Attachment("myhtml", "text/html",
+                        getTestDocumentAsStream("testHTML.html"));
+
+
+        Response response = WebClient.create(endPoint + TIKA_PATH+"/form/main")
+                .type("multipart/form-data")
+                .accept("text/plain")
+                .put(attachmentPart);
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        assertTrue(responseMsg.contains("Title : Test Indexation Html")); //this text must be present in the response
+        assertFalse(responseMsg.contains("Indexation du fichier")); //this text must be absent from the response
+    }
+
+    @Test
     public void testApplicationWadl() throws Exception {
         Response response = WebClient
                 .create(endPoint + TIKA_PATH + "?_wadl")

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.