You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2014/07/17 19:51:40 UTC

svn commit: r1611417 - in /tika/trunk/tika-translate/src: main/java/org/apache/tika/language/translate/ main/resources/META-INF/services/ test/java/org/apache/tika/language/translate/

Author: tpalsulich
Date: Thu Jul 17 17:51:40 2014
New Revision: 1611417

URL: http://svn.apache.org/r1611417
Log:
Fix for JIRA issue TIKA-1370, adding a CachedTranslator.

Added:
    tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
    tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java
Modified:
    tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator

Added: tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java?rev=1611417&view=auto
==============================================================================
--- tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java (added)
+++ tika/trunk/tika-translate/src/main/java/org/apache/tika/language/translate/CachedTranslator.java Thu Jul 17 17:51:40 2014
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.language.translate;
+
+import com.fasterxml.jackson.databind.util.LRUMap;
+import org.apache.tika.language.LanguageIdentifier;
+import org.apache.tika.language.LanguageProfile;
+
+import java.util.HashMap;
+
+/**
+ * A {@link Translator} that wraps another translator and remembers the translations it has
+ * produced, so that repeated requests for the same text and language pair are answered from
+ * memory instead of being sent to the underlying translation service again.
+ * <p>
+ * Translations are grouped per source/target language pair. This class performs no
+ * synchronization of its own.
+ */
+public class CachedTranslator implements Translator {
+    private static final int INITIAL_ENTRIES = 100;
+    private static final int MAX_ENTRIES = 1000;
+    // The wrapped translation service; null when created through the no-argument constructor.
+    private final Translator translator;
+    // The cache is a map from sourceLang:targetLang to an LRUMap of previously translated pairs.
+    // Old entries are removed from the cache when it reaches its limit.
+    // For example, {en:fr -> {hello -> salut}}.
+    private final HashMap<String, LRUMap<String, String>> cache;
+
+    /**
+     * Create a CachedTranslator without an underlying translation service.
+     * <p>
+     * This constructor exists because the class is listed in the
+     * {@code META-INF/services} registration for {@link Translator}, which requires a public
+     * no-argument constructor. An instance created this way reports itself as unavailable and
+     * returns text untranslated.
+     */
+    public CachedTranslator() {
+        this(null);
+    }
+
+    /**
+     * Create a new CachedTranslator.
+     *
+     * @param translator The translator that should be used for the underlying translation service. The properties
+     *                   for that service must be set properly! May be {@code null}, in which case this
+     *                   translator is unavailable and returns text untranslated.
+     */
+    public CachedTranslator(Translator translator) {
+        this.translator = translator;
+        this.cache = new HashMap<String, LRUMap<String, String>>();
+    }
+
+    /**
+     * Translate the text, consulting the cache first and asking the underlying translator only
+     * when no translation for this text and language pair has been stored yet.
+     *
+     * @param text The text to translate.
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return The cached or freshly obtained translation, or the text itself if this
+     *         CachedTranslator has no underlying translator.
+     * @throws Exception If the underlying translator fails.
+     */
+    @Override
+    public String translate(String text, String sourceLanguage, String targetLanguage) throws Exception {
+        if (translator == null) {
+            // Nothing to delegate to: hand the text back unchanged and keep it out of the cache.
+            return text;
+        }
+        LRUMap<String, String> translationCache = getTranslationCache(sourceLanguage, targetLanguage);
+        String translatedText = translationCache.get(text);
+        if (translatedText == null) {
+            translatedText = translator.translate(text, sourceLanguage, targetLanguage);
+            translationCache.put(text, translatedText);
+        }
+        return translatedText;
+    }
+
+    /**
+     * Translate the text to the target language, attempting to auto-detect the source language.
+     *
+     * @param text The text to translate.
+     * @param targetLanguage The target language of translation.
+     * @return The cached or freshly obtained translation.
+     * @throws Exception If the underlying translator fails.
+     */
+    @Override
+    public String translate(String text, String targetLanguage) throws Exception {
+        return translate(text, detectLanguage(text), targetLanguage);
+    }
+
+    /**
+     * @return true if an underlying translator is present and reports itself as available.
+     */
+    @Override
+    public boolean isAvailable() {
+        return translator != null && translator.isAvailable();
+    }
+
+    /**
+     * Get the number of different source/target translation pairs this CachedTranslator
+     * currently has in its cache.
+     *
+     * @return Number of translation source/target pairs in this CachedTranslator's cache.
+     * @since Tika 1.6
+     */
+    public int getNumTranslationPairs() {
+        return cache.size();
+    }
+
+    /**
+     * Get the number of different translations from the source language to the target language
+     * this CachedTranslator has in its cache.
+     *
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return The number of translations between source and target.
+     * @since Tika 1.6
+     */
+    public int getNumTranslationsFor(String sourceLanguage, String targetLanguage) {
+        LRUMap<String, String> translationCache = cache.get(buildCacheKeyString(sourceLanguage, targetLanguage));
+        return translationCache == null ? 0 : translationCache.size();
+    }
+
+    /**
+     * Check whether this CachedTranslator's cache contains a translation of the text from the
+     * source language to the target language. This is a pure query: it never adds a language
+     * pair to the cache.
+     *
+     * @param text What string to check for.
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return true if the cache contains a translation of the text, false otherwise.
+     */
+    public boolean contains(String text, String sourceLanguage, String targetLanguage) {
+        // Look the pair up directly instead of going through getTranslationCache(), which would
+        // create an empty entry and inflate getNumTranslationPairs().
+        LRUMap<String, String> translationCache = cache.get(buildCacheKeyString(sourceLanguage, targetLanguage));
+        return translationCache != null && translationCache.containsKey(text);
+    }
+
+    /**
+     * Check whether this CachedTranslator's cache contains a translation of the text to the target language,
+     * attempting to auto-detect the source language.
+     *
+     * @param text What string to check for.
+     * @param targetLanguage The target language of translation.
+     * @return true if the cache contains a translation of the text, false otherwise.
+     */
+    public boolean contains(String text, String targetLanguage) {
+        return contains(text, detectLanguage(text), targetLanguage);
+    }
+
+    /**
+     * Detect the language of the text with a {@link LanguageIdentifier} built from the text's
+     * {@link LanguageProfile}.
+     *
+     * @param text The text whose language should be detected.
+     * @return The language reported by the identifier.
+     */
+    private String detectLanguage(String text) {
+        return new LanguageIdentifier(new LanguageProfile(text)).getLanguage();
+    }
+
+    /**
+     * Build the String to be used as the key into this CachedTranslator's cache.
+     *
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return The string to be used as the key into this CachedTranslator's cache.
+     */
+    private String buildCacheKeyString(String sourceLanguage, String targetLanguage) {
+        return sourceLanguage + ":" + targetLanguage;
+    }
+
+    /**
+     * Get the cache of translations from the given source language to target language,
+     * creating and registering an empty one if the pair has not been seen before.
+     *
+     * @param sourceLanguage The source language of translation.
+     * @param targetLanguage The target language of translation.
+     * @return The LRUMap representing the translation cache.
+     */
+    private LRUMap<String, String> getTranslationCache(String sourceLanguage, String targetLanguage) {
+        String cacheKey = buildCacheKeyString(sourceLanguage, targetLanguage);
+        LRUMap<String, String> translationCache = cache.get(cacheKey);
+        if (translationCache == null) {
+            translationCache = new LRUMap<String, String>(INITIAL_ENTRIES, MAX_ENTRIES);
+            cache.put(cacheKey, translationCache);
+        }
+        return translationCache;
+    }
+}

Modified: tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
URL: http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator?rev=1611417&r1=1611416&r2=1611417&view=diff
==============================================================================
--- tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator (original)
+++ tika/trunk/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator Thu Jul 17 17:51:40 2014
@@ -13,4 +13,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-org.apache.tika.language.translate.MicrosoftTranslator
\ No newline at end of file
+org.apache.tika.language.translate.MicrosoftTranslator
+org.apache.tika.language.translate.GoogleTranslator
+org.apache.tika.language.translate.CachedTranslator
\ No newline at end of file

Added: tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java?rev=1611417&view=auto
==============================================================================
--- tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java (added)
+++ tika/trunk/tika-translate/src/test/java/org/apache/tika/language/translate/CachedTranslatorTest.java Thu Jul 17 17:51:40 2014
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.language.translate;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * Test harness for the {@link CachedTranslator}. Take care to choose your target language carefully
+ * if you're testing the size of the cache!
+ */
+public class CachedTranslatorTest {
+    private CachedTranslator cachedTranslator;
+
+    /**
+     * Give every test a fresh CachedTranslator wrapping a {@link GoogleTranslator}.
+     */
+    @Before
+    public void setUp() {
+        cachedTranslator = new CachedTranslator(new GoogleTranslator());
+    }
+
+    /**
+     * Translating the same string repeatedly must leave exactly one language pair with exactly
+     * one cached translation.
+     */
+    @Test
+    public void testCachingSingleString() throws Exception {
+        for (int i = 0; i < 20; i++) {
+            cachedTranslator.translate("This is a test string to translate!", "en", "sv");
+        }
+        // JUnit's argument order is (message, expected, actual).
+        assertEquals("Cache doesn't have a single translation pair!", 1, cachedTranslator.getNumTranslationPairs());
+        assertEquals("Cache has more than one element!", 1, cachedTranslator.getNumTranslationsFor("en", "sv"));
+    }
+
+    /**
+     * Translating two strings into two different target languages must produce two language
+     * pairs, each holding a single cached translation.
+     */
+    @Test
+    public void testCachingTwoStrings() throws Exception {
+        for (int i = 0; i < 20; i++) {
+            cachedTranslator.translate("This is a test string to translate!", "en", "no");
+            cachedTranslator.translate("This is a different string...", "en", "fr");
+        }
+        assertEquals("Cache doesn't have two translation pairs!", 2, cachedTranslator.getNumTranslationPairs());
+        assertEquals("Cache has more than en to no translation!", 1, cachedTranslator.getNumTranslationsFor("en", "no"));
+        assertEquals("Cache has more than en to fr translation!", 1, cachedTranslator.getNumTranslationsFor("en", "fr"));
+    }
+
+    /**
+     * When the translator reports itself as available, a simple Spanish phrase must translate
+     * to the expected English text. The check is skipped otherwise.
+     */
+    @Test
+    public void testSimpleTranslate() throws Exception {
+        String source = "hola senor";
+        String expected = "hello sir";
+
+        if (cachedTranslator.isAvailable()) {
+            String result = cachedTranslator.translate(source, "es", "en");
+            assertNotNull(result);
+            assertEquals("Result: [" + result
+                            + "]: not equal to expected: [" + expected + "]",
+                    expected, result);
+        }
+    }
+
+    /**
+     * The cache must report a translation only after it has been requested, both with an
+     * explicit source language and with the source language auto-detected from the text.
+     */
+    @Test
+    public void testCacheContains() throws Exception {
+        String text = "Text that should be long enough to detect a language from.";
+        assertFalse("Cache should not contain a translation!",
+                cachedTranslator.contains(text, "en", "it"));
+        cachedTranslator.translate(text, "en", "it");
+        assertTrue("Cache should contain a translation!",
+                cachedTranslator.contains(text, "en", "it"));
+        assertTrue("Cache should detect source language when checking if contains.",
+                cachedTranslator.contains(text, "it"));
+    }
+}