You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/05/05 03:13:20 UTC

svn commit: r1677717 - in /tika/trunk/tika-server/src: main/java/org/apache/tika/server/resource/LanguageResource.java test/java/org/apache/tika/server/LanguageResourceTest.java test/resources/english.txt test/resources/french.txt

Author: mattmann
Date: Tue May  5 01:13:20 2015
New Revision: 1677717

URL: http://svn.apache.org/r1677717
Log:
- fix for TIKA-1622 Expose Tika LanguageIdentifier via Tika Server

Added:
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
    tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
    tika/trunk/tika-server/src/test/resources/english.txt
    tika/trunk/tika-server/src/test/resources/french.txt

Added: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java?rev=1677717&view=auto
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java (added)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/LanguageResource.java Tue May  5 01:13:20 2015
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.resource;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.ws.rs.Consumes;
+import javax.ws.rs.POST;
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.language.LanguageIdentifier;
+import org.apache.tika.language.LanguageProfile;
+
+import com.google.common.base.Charsets;
+
+/**
+ * JAX-RS resource mounted at {@code /language} that runs Tika's
+ * {@link LanguageIdentifier} over the request body and answers with the
+ * detected language as plain text.
+ */
+@Path("/language")
+public class LanguageResource {
+
+	private static final Log logger = LogFactory.getLog(LanguageResource.class
+			.getName());
+
+	/** Configuration passed to the constructor; stored but not read in this class. */
+	private final TikaConfig config;
+
+	/**
+	 * Creates the resource.
+	 *
+	 * @param config Tika configuration to associate with this resource
+	 */
+	public LanguageResource(TikaConfig config) {
+		this.config = config;
+	}
+
+	/**
+	 * Detects the language of a request body delivered as a stream. The bytes
+	 * are decoded as UTF-8 before identification.
+	 * <p>
+	 * NOTE(review): both {@code @PUT} and {@code @POST} are declared, but only
+	 * PUT is exercised by the accompanying test; confirm that the JAX-RS
+	 * runtime in use also routes POST to this method.
+	 *
+	 * @param is request body
+	 * @return the language reported by {@link LanguageIdentifier#getLanguage()}
+	 * @throws IOException if the body cannot be read
+	 */
+	@PUT
+	@POST
+	@Path("/stream")
+	@Consumes("*/*")
+	@Produces("text/plain")
+	public String detect(final InputStream is) throws IOException {
+		String fileTxt = IOUtils.toString(is, Charsets.UTF_8);
+		// Guarded so the payload is not concatenated when debug is disabled.
+		if (logger.isDebugEnabled()) {
+			logger.debug("File: " + fileTxt);
+		}
+		return identifyLanguage(fileTxt);
+	}
+
+	/**
+	 * Detects the language of a request body delivered as a string.
+	 * <p>
+	 * NOTE(review): carries the same dual {@code @PUT}/{@code @POST}
+	 * declaration as the stream variant; confirm POST routing here too.
+	 *
+	 * @param string text whose language should be identified
+	 * @return the language reported by {@link LanguageIdentifier#getLanguage()}
+	 * @throws IOException kept in the signature for existing callers
+	 */
+	@PUT
+	@POST
+	@Path("/string")
+	@Consumes("*/*")
+	@Produces("text/plain")
+	public String detect(final String string) throws IOException {
+		if (logger.isDebugEnabled()) {
+			logger.debug("String: " + string);
+		}
+		return identifyLanguage(string);
+	}
+
+	/**
+	 * Shared detection step for both endpoints: profiles the text, asks
+	 * {@link LanguageIdentifier} for its language and logs the result at info
+	 * level.
+	 *
+	 * @param text content to profile
+	 * @return the language reported by {@link LanguageIdentifier#getLanguage()}
+	 */
+	private static String identifyLanguage(String text) {
+		LanguageIdentifier lang = new LanguageIdentifier(new LanguageProfile(
+				text));
+		String detectedLang = lang.getLanguage();
+		logger.info("Detecting language for incoming resource: ["
+				+ detectedLang + "]");
+		return detectedLang;
+	}
+
+}

Added: tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java?rev=1677717&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java (added)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java Tue May  5 01:13:20 2015
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import javax.ws.rs.core.Response;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.server.resource.LanguageResource;
+import org.apache.tika.server.writer.TarWriter;
+import org.apache.tika.server.writer.ZipWriter;
+import org.junit.Test;
+
+/**
+ * Round-trip tests for the {@code /language} endpoints served by
+ * {@link LanguageResource}: each case PUTs a short English or French sample,
+ * as a string or as a classpath file, and checks the returned language code.
+ */
+public class LanguageResourceTest extends CXFTestBase {
+
+	private static final String LANG_PATH = "/language";
+	private static final String LANG_STREAM_PATH = LANG_PATH + "/stream";
+	private static final String LANG_STRING_PATH = LANG_PATH + "/string";
+	private static final String ENGLISH_STRING = "This is English!";
+	private static final String FRENCH_STRING = "comme çi comme ça";
+
+	/** Registers {@link LanguageResource} through a SingletonResourceProvider. */
+	@Override
+	protected void setUpResources(JAXRSServerFactoryBean sf) {
+		sf.setResourceClasses(LanguageResource.class);
+		SingletonResourceProvider provider = new SingletonResourceProvider(
+				new LanguageResource(tika));
+		sf.setResourceProvider(LanguageResource.class, provider);
+	}
+
+	/** Installs the tar/zip writers and the parse-exception mapper as providers. */
+	@Override
+	protected void setUpProviders(JAXRSServerFactoryBean sf) {
+		List<Object> registered = new ArrayList<Object>();
+		registered.add(new TarWriter());
+		registered.add(new ZipWriter());
+		registered.add(new TikaServerParseExceptionMapper(false));
+		sf.setProviders(registered);
+	}
+
+	@Test
+	public void testDetectEnglishString() throws Exception {
+		assertEquals("en", putForLanguage(LANG_STRING_PATH, ENGLISH_STRING));
+	}
+
+	@Test
+	public void testDetectFrenchString() throws Exception {
+		assertEquals("fr", putForLanguage(LANG_STRING_PATH, FRENCH_STRING));
+	}
+
+	@Test
+	public void testDetectEnglishFile() throws Exception {
+		InputStream sample = ClassLoader.getSystemResourceAsStream("english.txt");
+		assertEquals("en", putForLanguage(LANG_STREAM_PATH, sample));
+	}
+
+	@Test
+	public void testDetectFrenchFile() throws Exception {
+		InputStream sample = ClassLoader.getSystemResourceAsStream("french.txt");
+		assertEquals("fr", putForLanguage(LANG_STREAM_PATH, sample));
+	}
+
+	/**
+	 * PUTs {@code body} as {@code text/plain} to {@code endPoint + path},
+	 * asserts that a response came back and returns its entity as a string.
+	 *
+	 * @param path resource path appended to the server end point
+	 * @param body request entity (a string or an input stream)
+	 * @return the response entity read via {@code getStringFromInputStream}
+	 * @throws Exception if the request or reading the response fails
+	 */
+	private String putForLanguage(String path, Object body) throws Exception {
+		String url = endPoint + path;
+		Response response = WebClient.create(url).type("text/plain")
+				.accept("text/plain").put(body);
+		assertNotNull(response);
+		InputStream entity = (InputStream) response.getEntity();
+		return getStringFromInputStream(entity);
+	}
+
+}

Added: tika/trunk/tika-server/src/test/resources/english.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/english.txt?rev=1677717&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/resources/english.txt (added)
+++ tika/trunk/tika-server/src/test/resources/english.txt Tue May  5 01:13:20 2015
@@ -0,0 +1 @@
+This is English!

Added: tika/trunk/tika-server/src/test/resources/french.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/resources/french.txt?rev=1677717&view=auto
==============================================================================
--- tika/trunk/tika-server/src/test/resources/french.txt (added)
+++ tika/trunk/tika-server/src/test/resources/french.txt Tue May  5 01:13:20 2015
@@ -0,0 +1 @@
+comme çi comme ça