You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/07/17 19:51:40 UTC
svn commit: r1611417 - in /tika/trunk/tika-translate/src:
main/java/org/apache/tika/language/translate/
main/resources/META-INF/services/
test/java/org/apache/tika/language/translate/
Author: tpalsulich
Date: Thu Jul 17 17:51:40 2014
New Revision: 1611417
URL: http://svn.apache.org/r1611417
Log:
Fix for JIRA issue TIKA-1370, adding a CachedTranslator.
Added:
tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java
Modified:
tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
Added: tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java?rev=1611417&view=auto
==============================================================================
--- tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java (added)
+++ tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java Thu Jul 17 17:51:40 2014
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.language.translate;
+
+import com.fasterxml.jackson.databind.util.LRUMap;
+import org.apache.tika.language.LanguageIdentifier;
+import org.apache.tika.language.LanguageProfile;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * CachedTranslator. Wraps another {@link Translator} and saves a map of previous translations in order to
+ * prevent repetitive translation requests to the underlying service.
+ * <p>
+ * This class does no synchronization of its own.
+ */
+public class CachedTranslator implements Translator {
+    private static final int INITIAL_ENTRIES = 100;
+    private static final int MAX_ENTRIES = 1000;
+    // The wrapped translation service; null only when the no-argument constructor was used.
+    private final Translator translator;
+    // The cache is a map from sourceLang:targetLang to an LRUMap of previously translated pairs.
+    // Each LRUMap is created with a limit of MAX_ENTRIES, so old entries are dropped once it is full.
+    // For example, {en:fr -> {hello -> salut}}.
+    private final Map<String, LRUMap<String, String>> cache;
+
+    /**
+     * Create a new CachedTranslator with no underlying translation service. A public no-argument
+     * constructor is needed because this class is listed in the META-INF/services provider file for
+     * {@link Translator} and service loading instantiates providers without arguments. An instance created
+     * this way reports {@link #isAvailable()} as false and returns text untranslated.
+     */
+    public CachedTranslator() {
+        this(null);
+    }
+
+    /**
+     * Create a new CachedTranslator.
+     *
+     * @param translator The translator that should be used for the underlying translation service. The properties
+     *                   for that service must be set properly!
+     */
+    public CachedTranslator(Translator translator) {
+        this.translator = translator;
+        this.cache = new HashMap<String, LRUMap<String, String>>();
+    }
+
+    /**
+     * Translate text between the given languages, asking the underlying translator only when the
+     * text is not already cached for that language pair.
+     *
+     * @param text           The text to translate.
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return The cached or freshly obtained translation; the text itself if there is no underlying translator.
+     * @throws Exception If the underlying translator fails.
+     */
+    @Override
+    public String translate(String text, String sourceLanguage, String targetLanguage) throws Exception {
+        if (translator == null) {
+            // Nothing to delegate to: hand the text back rather than fail with a NullPointerException.
+            return text;
+        }
+        Map<String, String> translationCache = getTranslationCache(sourceLanguage, targetLanguage);
+        String translatedText = translationCache.get(text);
+        if (translatedText == null) {
+            translatedText = translator.translate(text, sourceLanguage, targetLanguage);
+            translationCache.put(text, translatedText);
+        }
+        return translatedText;
+    }
+
+    /**
+     * Translate text to the target language, detecting the source language from the text itself.
+     *
+     * @param text           The text to translate.
+     * @param targetLanguage The target language of translation.
+     * @return The translation, as for {@link #translate(String, String, String)}.
+     * @throws Exception If the underlying translator fails.
+     */
+    @Override
+    public String translate(String text, String targetLanguage) throws Exception {
+        return translate(text, detectLanguage(text), targetLanguage);
+    }
+
+    /**
+     * @return true only if there is an underlying translator and it reports itself available.
+     */
+    @Override
+    public boolean isAvailable() {
+        return translator != null && translator.isAvailable();
+    }
+
+    /**
+     * Get the number of different source/target translation pairs this CachedTranslator
+     * currently has in its cache.
+     *
+     * @return Number of translation source/target pairs in this CachedTranslator's cache.
+     * @since Tika 1.6
+     */
+    public int getNumTranslationPairs() {
+        return cache.size();
+    }
+
+    /**
+     * Get the number of different translations from the source language to the target language
+     * this CachedTranslator has in its cache.
+     *
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return The number of translations between source and target.
+     * @since Tika 1.6
+     */
+    public int getNumTranslationsFor(String sourceLanguage, String targetLanguage) {
+        Map<String, String> translationCache = cache.get(buildCacheKeyString(sourceLanguage, targetLanguage));
+        return translationCache == null ? 0 : translationCache.size();
+    }
+
+    /**
+     * Check whether this CachedTranslator's cache contains a translation of the text from the
+     * source language to the target language. This is a pure lookup: it never adds anything to the cache.
+     *
+     * @param text           What string to check for.
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return true if the cache contains a translation of the text, false otherwise.
+     */
+    public boolean contains(String text, String sourceLanguage, String targetLanguage) {
+        // Read the outer map directly; getTranslationCache() would create an empty entry for the pair.
+        Map<String, String> translationCache = cache.get(buildCacheKeyString(sourceLanguage, targetLanguage));
+        return translationCache != null && translationCache.containsKey(text);
+    }
+
+    /**
+     * Check whether this CachedTranslator's cache contains a translation of the text to the target language,
+     * attempting to auto-detect the source language.
+     *
+     * @param text           What string to check for.
+     * @param targetLanguage The target language of translation.
+     * @return true if the cache contains a translation of the text, false otherwise.
+     */
+    public boolean contains(String text, String targetLanguage) {
+        return contains(text, detectLanguage(text), targetLanguage);
+    }
+
+    /**
+     * Detect the language of the text with a {@link LanguageIdentifier}.
+     *
+     * @param text The text to inspect.
+     * @return The language reported by the identifier.
+     */
+    private String detectLanguage(String text) {
+        return new LanguageIdentifier(new LanguageProfile(text)).getLanguage();
+    }
+
+    /**
+     * Build the String to be used as the key into this CachedTranslator's cache.
+     *
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return The string to be used as the key into this CachedTranslator's cache.
+     */
+    private String buildCacheKeyString(String sourceLanguage, String targetLanguage) {
+        return sourceLanguage + ":" + targetLanguage;
+    }
+
+    /**
+     * Get the cache of translations from the given source language to target language,
+     * creating and registering an empty one if the pair has not been seen before.
+     *
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return The LRUMap representing the translation cache.
+     */
+    private LRUMap<String, String> getTranslationCache(String sourceLanguage, String targetLanguage) {
+        String key = buildCacheKeyString(sourceLanguage, targetLanguage);
+        LRUMap<String, String> translationCache = cache.get(key);
+        if (translationCache == null) {
+            translationCache = new LRUMap<String, String>(INITIAL_ENTRIES, MAX_ENTRIES);
+            cache.put(key, translationCache);
+        }
+        return translationCache;
+    }
+}
Modified: tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
URL: http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator?rev=1611417&r1=1611416&r2=1611417&view=diff
==============================================================================
--- tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator (original)
+++ tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator Thu Jul 17 17:51:40 2014
@@ -13,4 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.tika.language.translate.MicrosoftTranslator
\ No newline at end of file
+org.apache.tika.language.translate.MicrosoftTranslator
+org.apache.tika.language.translate.GoogleTranslator
+org.apache.tika.language.translate.CachedTranslator
\ No newline at end of file
Added: tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java?rev=1611417&view=auto
==============================================================================
--- tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java (added)
+++ tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java Thu Jul 17 17:51:40 2014
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.language.translate;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * Test harness for the {@link CachedTranslator}. The caching tests run against an in-memory stub
+ * translator so that they are deterministic and can count how often the wrapped translator is asked;
+ * only {@link #testSimpleTranslate()} uses a {@link GoogleTranslator}.
+ */
+public class CachedTranslatorTest {
+    private CountingTranslator delegate;
+    private CachedTranslator cachedTranslator;
+
+    @Before
+    public void setUp() {
+        delegate = new CountingTranslator();
+        cachedTranslator = new CachedTranslator(delegate);
+    }
+
+    @Test
+    public void testCachingSingleString() throws Exception {
+        for (int i = 0; i < 20; i++) {
+            cachedTranslator.translate("This is a test string to translate!", "en", "sv");
+        }
+        // JUnit's assertEquals takes (message, expected, actual).
+        assertEquals("Cache doesn't have a single translation pair!", 1, cachedTranslator.getNumTranslationPairs());
+        assertEquals("Cache has more than one element!", 1, cachedTranslator.getNumTranslationsFor("en", "sv"));
+        assertEquals("Wrapped translator should only have been asked once!", 1, delegate.calls);
+    }
+
+    @Test
+    public void testCachingTwoStrings() throws Exception {
+        for (int i = 0; i < 20; i++) {
+            cachedTranslator.translate("This is a test string to translate!", "en", "no");
+            cachedTranslator.translate("This is a different string...", "en", "fr");
+        }
+        assertEquals("Cache doesn't have two translation pairs!", 2, cachedTranslator.getNumTranslationPairs());
+        assertEquals("Cache has more than en to no translation!", 1, cachedTranslator.getNumTranslationsFor("en", "no"));
+        assertEquals("Cache has more than en to fr translation!", 1, cachedTranslator.getNumTranslationsFor("en", "fr"));
+        assertEquals("Wrapped translator should have been asked once per string!", 2, delegate.calls);
+    }
+
+    @Test
+    public void testSimpleTranslate() throws Exception {
+        String source = "hola senor";
+        String expected = "hello sir";
+        CachedTranslator googleBacked = new CachedTranslator(new GoogleTranslator());
+
+        // Only assert when the Google-backed translator reports itself available.
+        if (googleBacked.isAvailable()) {
+            String result = googleBacked.translate(source, "es", "en");
+            assertNotNull(result);
+            assertEquals("Result: [" + result
+                            + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        }
+    }
+
+    @Test
+    public void testCacheContains() throws Exception {
+        String text = "Text that should be long enough to detect a language from.";
+        assertFalse("Cache should not contain a translation!",
+                cachedTranslator.contains(text, "en", "it"));
+        cachedTranslator.translate(text, "en", "it");
+        assertTrue("Cache should contain a translation!",
+                cachedTranslator.contains(text, "en", "it"));
+        assertTrue("Cache should detect source language when checking if contains.",
+                cachedTranslator.contains(text, "it"));
+    }
+
+    /**
+     * Stub translator that works offline and records how many translations were requested from it.
+     */
+    private static class CountingTranslator implements Translator {
+        private int calls;
+
+        @Override
+        public String translate(String text, String sourceLanguage, String targetLanguage) {
+            calls++;
+            return "[" + sourceLanguage + "->" + targetLanguage + "] " + text;
+        }
+
+        @Override
+        public String translate(String text, String targetLanguage) {
+            return translate(text, "auto", targetLanguage);
+        }
+
+        @Override
+        public boolean isAvailable() {
+            return true;
+        }
+    }
+}