You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/04 01:28:07 UTC
[tika] branch 2.x updated: TIKA 2343 -- add text-main/boilerpipe
option to tika-server
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
new 9ef0787 TIKA 2343 -- add text-main/boilerpipe option to tika-server
9ef0787 is described below
commit 9ef078778bfe2f4f0fc4cbd1897a8cd0c663b60c
Author: tballison <ta...@mitre.org>
AuthorDate: Wed May 3 21:27:57 2017 -0400
TIKA 2343 -- add text-main/boilerpipe option to tika-server
---
.../apache/tika/server/resource/TikaResource.java | 49 ++++++++++++++++++++--
.../org/apache/tika/server/TikaResourceTest.java | 38 +++++++++++++++--
2 files changed, 80 insertions(+), 7 deletions(-)
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 37cff74..3423b32 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -17,8 +17,6 @@
package org.apache.tika.server.resource;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
import javax.mail.internet.ContentDisposition;
import javax.mail.internet.ParseException;
import javax.ws.rs.Consumes;
@@ -69,17 +67,20 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
-import org.apache.tika.server.RichTextContentHandler;
import org.apache.tika.server.InputStreamFactory;
+import org.apache.tika.server.RichTextContentHandler;
import org.apache.tika.server.TikaServerParseException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
@Path("/tika")
public class TikaResource {
public static final String GREETING = "This is Tika Server ("+new Tika().toString()+"). Please PUT\n";
@@ -362,6 +363,48 @@ public class TikaResource {
return produceText(att.getObject(InputStream.class), att.getHeaders(), info);
}
+ /**
+  * Handles PUT requests on the "main" sub-path of this resource: consumes a body of
+  * any media type and produces text/plain. This is equivalent to text-main in tika-app.
+  * All of the work is delegated to {@link #produceTextMain}, which receives the raw
+  * request headers obtained from {@code httpHeaders.getRequestHeaders()}.
+  *
+  * @param is the request body to be parsed
+  * @param httpHeaders injected HTTP headers of the request
+  * @param info injected URI information of the request
+  * @return whatever {@code produceTextMain} returns for these arguments
+  */
+ @PUT
+ @Consumes("*/*")
+ @Produces("text/plain")
+ @Path("main")
+ public StreamingOutput getTextMain(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+ return produceTextMain(is, httpHeaders.getRequestHeaders(), info);
+ }
+
+ /**
+  * Multipart variant of the text-main endpoint: handles PUT requests on the
+  * "form/main" sub-path, consumes multipart/form-data and produces text/plain.
+  * This is equivalent to text-main (Boilerpipe handler) in tika-app.
+  * The attachment's content is obtained as an {@code InputStream} and handed,
+  * together with the attachment's own headers, to {@link #produceTextMain}.
+  *
+  * @param att the multipart attachment carrying the document to parse
+  * @param info injected URI information of the request
+  * @return whatever {@code produceTextMain} returns for these arguments
+  */
+ @PUT
+ @Consumes("multipart/form-data")
+ @Produces("text/plain")
+ @Path("form/main")
+ public StreamingOutput getTextMainFromMultipart(final Attachment att, @Context final UriInfo info) {
+ return produceTextMain(att.getObject(InputStream.class), att.getHeaders(), info);
+ }
+ /**
+  * Shared implementation behind the two text-main endpoints. Creates a parser, a
+  * fresh {@code Metadata} and a fresh {@code ParseContext}, fills them from the
+  * supplied headers, logs the request, and returns a {@code StreamingOutput} that
+  * performs the actual parse when its {@code write} method is invoked.
+  *
+  * NOTE(review): the {@code @Context} annotations on the parameters look redundant,
+  * since the visible callers pass these arguments explicitly -- confirm.
+  *
+  * @param is the document to parse; closed by the returned object's {@code write}
+  * @param httpHeaders headers used to fill the metadata and the parse context
+  * @param info URI information; used for request logging and its path is passed to {@code parse}
+  * @return a {@code StreamingOutput} that writes UTF-8 encoded text to the response stream
+  */
+ public StreamingOutput produceTextMain(final InputStream is, @Context MultivaluedMap<String, String> httpHeaders, @Context final UriInfo info) {
+ final Parser parser = createParser();
+ final Metadata metadata = new Metadata();
+ final ParseContext context = new ParseContext();
+ // Header-driven setup runs here, outside the StreamingOutput, i.e. before any output is written.
+ fillMetadata(parser, metadata, context, httpHeaders);
+ fillParseContext(context, httpHeaders, parser);
+
+ logRequest(logger, info, metadata);
+ // Parsing is deferred: parse(...) only runs when write(...) is called on the returned object.
+ return new StreamingOutput() {
+ public void write(OutputStream outputStream) throws IOException, WebApplicationException {
+ Writer writer = new OutputStreamWriter(outputStream, UTF_8);
+ // The handler is built around the UTF-8 writer; presumably it emits only the main content -- see BoilerpipeContentHandler.
+ ContentHandler handler = new BoilerpipeContentHandler(writer);
+ // try-with-resources closes the incoming stream whether parse(...) returns or throws.
+ try (InputStream inputStream = is) {
+ parse(parser, logger, info.getPath(), inputStream, handler, metadata, context);
+ }
+ }
+ };
+ }
+
+
@PUT
@Consumes("*/*")
@Produces("text/plain")
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 4581157..f414777 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -17,9 +17,6 @@
package org.apache.tika.server;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
import javax.ws.rs.core.Response;
import java.io.InputStream;
import java.util.ArrayList;
@@ -32,11 +29,15 @@ import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.server.resource.TikaResource;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
import org.junit.Test;
public class TikaResourceTest extends CXFTestBase {
public static final String TEST_DOC = "test.doc";
- public static final String TEST_XLSX = "16637.xlsx";
public static final String TEST_PASSWORD_PROTECTED = "password.xls";
private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
@@ -77,6 +78,35 @@ public class TikaResourceTest extends CXFTestBase {
}
@Test
+ public void testTextMain() throws Exception { // PUTs testHTML.html to the "/main" endpoint and inspects the text/plain reply
+ Response response = WebClient.create(endPoint + TIKA_PATH + "/main")
+ .accept("text/plain")
+ .put(getTestDocumentAsStream("testHTML.html"));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("Title : Test Indexation Html")); // this text must be present in the response
+ assertFalse(responseMsg.contains("Indexation du fichier")); // this text must be absent; presumably dropped as boilerplate -- confirm against testHTML.html
+ }
+ /**
+  * Sends testHTML.html as a multipart/form-data attachment to the "/form/main"
+  * endpoint and makes the same two content assertions on the text/plain response:
+  * one string must be present and one must be absent.
+  */
+ @Test
+ public void testTextMainMultipart() throws Exception {
+ // Wrap the test document in an attachment with id "myhtml" and content type text/html.
+ Attachment attachmentPart =
+ new Attachment("myhtml", "text/html",
+ getTestDocumentAsStream("testHTML.html"));
+
+
+ Response response = WebClient.create(endPoint + TIKA_PATH+"/form/main")
+ .type("multipart/form-data")
+ .accept("text/plain")
+ .put(attachmentPart);
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("Title : Test Indexation Html")); // this text must be present in the response
+ assertFalse(responseMsg.contains("Indexation du fichier")); // this text must be absent; presumably dropped as boilerplate -- confirm against testHTML.html
+ }
+
+ @Test
public void testApplicationWadl() throws Exception {
Response response = WebClient
.create(endPoint + TIKA_PATH + "?_wadl")
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.