You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/05/05 03:13:20 UTC
svn commit: r1677717 - in /tika/trunk/tika-server/src:
main/java/org/apache/tika/server/resource/LanguageResource.java
test/java/org/apache/tika/server/LanguageResourceTest.java
test/resources/english.txt test/resources/french.txt
Author: mattmann
Date: Tue May 5 01:13:20 2015
New Revision: 1677717
URL: http://svn.apache.org/r1677717
Log:
- fix for TIKA-1622 Expose Tika LanguageIdentifier via Tika Server
Added:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
tika/trunk/tika-server/src/test/resources/english.txt
tika/trunk/tika-server/src/test/resources/french.txt
Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java?rev=1677717&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java Tue May 5 01:13:20 2015
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.resource;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.language.LanguageIdentifier;
+import org.apache.tika.language.LanguageProfile;
+
+import com.google.common.base.Charsets;
+
+/** JAX-RS resource at {@code /language}: returns the language detected for a request body. */
+@Path("/language")
+public class LanguageResource {
+  private static final Log logger = LogFactory.getLog(LanguageResource.class.getName());
+
+  /** Kept from construction; the detection methods below never read it. */
+  private final TikaConfig config;
+
+  public LanguageResource(TikaConfig config) {
+    this.config = config;
+  }
+
+  /** Decodes the raw request body as UTF-8 and returns the language detected for that text. */
+  @PUT
+  @POST
+  @Path("/stream")
+  @Consumes("*/*")
+  @Produces("text/plain")
+  public String detect(final InputStream is) throws IOException {
+    String fileTxt = IOUtils.toString(is, Charsets.UTF_8);
+    if (logger.isDebugEnabled()) { // avoid concatenating the whole body when debug is off
+      logger.debug("File: " + fileTxt);
+    }
+    return identify(fileTxt);
+  }
+
+  /** Returns the language detected for a request body delivered as a String. */
+  @PUT
+  @POST
+  @Path("/string")
+  @Consumes("*/*")
+  @Produces("text/plain")
+  public String detect(final String string) throws IOException {
+    if (logger.isDebugEnabled()) {
+      logger.debug("String: " + string);
+    }
+    return identify(string);
+  }
+
+  /** Shared by both endpoints: profiles the text, logs the detected language and returns it. */
+  private String identify(String text) {
+    String detectedLang = new LanguageIdentifier(new LanguageProfile(text)).getLanguage();
+    logger.info("Detecting language for incoming resource: [" + detectedLang + "]");
+    return detectedLang;
+  }
+}
Added: tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java?rev=1677717&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java (added)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java Tue May 5 01:13:20 2015
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import javax.ws.rs.core.Response;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.server.resource.LanguageResource;
+import org.apache.tika.server.writer.TarWriter;
+import org.apache.tika.server.writer.ZipWriter;
+import org.junit.Test;
+
+/**
+ * Exercises the {@code /language/string} and {@code /language/stream}
+ * endpoints of {@link LanguageResource} with short English and French inputs.
+ */
+public class LanguageResourceTest extends CXFTestBase {
+
+  private static final String LANG_PATH = "/language";
+  private static final String LANG_STREAM_PATH = LANG_PATH + "/stream";
+  private static final String LANG_STRING_PATH = LANG_PATH + "/string";
+  private static final String ENGLISH_STRING = "This is English!";
+  private static final String FRENCH_STRING = "comme çi comme ça";
+
+  /** Registers one LanguageResource instance through a SingletonResourceProvider. */
+  @Override
+  protected void setUpResources(JAXRSServerFactoryBean sf) {
+    sf.setResourceClasses(LanguageResource.class);
+    LanguageResource resource = new LanguageResource(tika);
+    sf.setResourceProvider(LanguageResource.class, new SingletonResourceProvider(resource));
+  }
+
+  /** Installs the tar/zip writers and the parse-exception mapper as providers. */
+  @Override
+  protected void setUpProviders(JAXRSServerFactoryBean sf) {
+    List<Object> providers = new ArrayList<Object>();
+    providers.add(new TarWriter());
+    providers.add(new ZipWriter());
+    providers.add(new TikaServerParseExceptionMapper(false));
+    sf.setProviders(providers);
+  }
+
+  /** An English string sent to the string endpoint is reported as "en". */
+  @Test
+  public void testDetectEnglishString() throws Exception {
+    assertEquals("en", putAndReadLanguage(LANG_STRING_PATH, ENGLISH_STRING));
+  }
+
+  /** A French string sent to the string endpoint is reported as "fr". */
+  @Test
+  public void testDetectFrenchString() throws Exception {
+    assertEquals("fr", putAndReadLanguage(LANG_STRING_PATH, FRENCH_STRING));
+  }
+
+  /** The english.txt classpath resource sent to the stream endpoint is reported as "en". */
+  @Test
+  public void testDetectEnglishFile() throws Exception {
+    InputStream body = ClassLoader.getSystemResourceAsStream("english.txt");
+    assertEquals("en", putAndReadLanguage(LANG_STREAM_PATH, body));
+  }
+
+  /** The french.txt classpath resource sent to the stream endpoint is reported as "fr". */
+  @Test
+  public void testDetectFrenchFile() throws Exception {
+    InputStream body = ClassLoader.getSystemResourceAsStream("french.txt");
+    assertEquals("fr", putAndReadLanguage(LANG_STREAM_PATH, body));
+  }
+
+  /**
+   * Sends {@code body} with PUT to {@code endPoint + path}, using text/plain
+   * for both the request type and the accepted response type, and reads the
+   * response entity back as a string.
+   *
+   * @param path resource path appended to the test server's endpoint
+   * @param body request entity, either a String or an InputStream
+   * @return the response body, expected to be the detected language code
+   * @throws Exception if the request or reading the response fails
+   */
+  private String putAndReadLanguage(String path, Object body) throws Exception {
+    String url = endPoint + path;
+    Response response = WebClient.create(url).type("text/plain").accept("text/plain").put(body);
+    assertNotNull(response);
+    return getStringFromInputStream((InputStream) response.getEntity());
+  }
+}
Added: tika/trunk/tika-server/src/test/resources/english.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/english.txt?rev=1677717&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/resources/english.txt (added)
+++ tika/trunk/tika-server/src/test/resources/english.txt Tue May 5 01:13:20 2015
@@ -0,0 +1 @@
+This is English!
Added: tika/trunk/tika-server/src/test/resources/french.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/french.txt?rev=1677717&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/resources/french.txt (added)
+++ tika/trunk/tika-server/src/test/resources/french.txt Tue May 5 01:13:20 2015
@@ -0,0 +1 @@
+comme çi comme ça