You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by le...@apache.org on 2016/10/26 02:37:06 UTC

[1/7] tika git commit: TIKA-1343 Create a Tika Translator implementation that uses JoshuaDecoder

Repository: tika
Updated Branches:
  refs/heads/master 7ca105ef5 -> dadbf55c5


TIKA-1343 Create a Tika Translator implementation that uses JoshuaDecoder


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d4fb28f9
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d4fb28f9
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d4fb28f9

Branch: refs/heads/master
Commit: d4fb28f91d77458b15557942438f874b9f564e88
Parents: 19ed261
Author: Lewis John McGibbney <le...@jpl.nasa.gov>
Authored: Wed Apr 27 15:06:42 2016 -0700
Committer: Lewis John McGibbney <le...@jpl.nasa.gov>
Committed: Wed Apr 27 15:06:42 2016 -0700

----------------------------------------------------------------------
 .../tika/language/detect/LanguageResult.java    |   6 +-
 .../tika/language/translate/Translator.java     |  20 +-
 tika-parsers/pom.xml                            |  14 ++
 .../language/translate/AbstractTranslator.java  |  18 +-
 .../language/translate/GoogleTranslator.java    |   5 -
 .../translate/JoshuaNetworkTranslator.java      | 189 +++++++++++++++++++
 .../language/translate/MosesTranslator.java     |   2 +
 .../translate/translator.google.properties      |   6 +-
 .../translate/translator.joshua.properties      |  22 +++
 .../translate/translator.lingo24.properties     |   5 -
 .../translate/translator.moses.properties       |   5 -
 .../translate/translator.yandex.properties      |  15 ++
 .../translate/JoshuaNetworkTranslatorTest.java  |  41 ++++
 .../translate/MicrosoftTranslatorTest.java      |   1 -
 .../translate/YandexTranslatorTest.java         |   2 -
 15 files changed, 309 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
index aaa503b..63e1f8c 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
@@ -33,7 +33,7 @@ public class LanguageResult {
 	
 	/**
 	 * 
-	 * @param language ISO 639-1 language code (plus optional "-<country code>")
+	 * @param language ISO 639-1 language code (plus optional country code)
 	 * @param rawScore confidence of detector in the result.
 	 */
 	public LanguageResult(String language, LanguageConfidence confidence, float rawScore) {
@@ -42,6 +42,10 @@ public class LanguageResult {
 		this.rawScore = rawScore;
 	}
 
+	/**
+	 * The ISO 639-1 language code (plus optional country code)
+	 * @return a string representation of the language code
+	 */
 	public String getLanguage() {
 		return language;
 	}

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
index f225565..912e30f 100644
--- a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
+++ b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
@@ -26,14 +26,7 @@ import java.io.IOException;
  */
 public interface Translator {
     /**
-     * Translate text between given languages. The following languages are supported:
-     * Arabic("ar"), Bulgarian("bg"), Catalan("ca"), Chinese-Simplified("zh-CHS"), Chinese-Traditional("zh-CHT"),
-     * Czech("cs"), Danish("da"), Dutch("nl"), English("en"), Estonian("et"),  Innish("fi"), French("fr"), German("de"),
-     * Greek("el"), Haitian-Creole("ht"), Hebrew("he"), Hindi("hi"), Hmong-Daw("mww"), Hungarian("hu"),
-     * Indonesian("id"), Italian("it"), Japanese("ja"), Korean("ko"), Latvian("lv"), Lithuanian("lt"), Malay("ms"),
-     * Norwegian("no"), Persian("fa"), Polish("pl"), Portuguese("pt"), Romanian("ro"), Russian("ru"), Slovak("sk"),
-     * Slovenian("sl"), Spanish("es"), Swedish("sv"), Thai("th"), Turkish("tr"), Ukranian("uk"), Urdu("ur"),
-     * Vietnemese("vi").
+     * Translate text between given languages.
      * @param text The text to translate.
      * @param sourceLanguage The input text language (for example, "en").
      * @param targetLanguage The desired language to translate to (for example, "fr").
@@ -45,15 +38,8 @@ public interface Translator {
     public String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException;
 
     /**
-     * Translate text to the given language. This method attempts to auto-detect the source language of the text.
-     * The following languages are supported:
-     * Arabic("ar"), Bulgarian("bg"), Catalan("ca"), Chinese-Simplified("zh-CHS"), Chinese-Traditional("zh-CHT"),
-     * Czech("cs"), Danish("da"), Dutch("nl"), English("en"), Estonian("et"),  Innish("fi"), French("fr"), German("de"),
-     * Greek("el"), Haitian-Creole("ht"), Hebrew("he"), Hindi("hi"), Hmong-Daw("mww"), Hungarian("hu"),
-     * Indonesian("id"), Italian("it"), Japanese("ja"), Korean("ko"), Latvian("lv"), Lithuanian("lt"), Malay("ms"),
-     * Norwegian("no"), Persian("fa"), Polish("pl"), Portuguese("pt"), Romanian("ro"), Russian("ru"), Slovak("sk"),
-     * Slovenian("sl"), Spanish("es"), Swedish("sv"), Thai("th"), Turkish("tr"), Ukranian("uk"), Urdu("ur"),
-     * Vietnemese("vi").
+     * Translate text to the given language.
+     * This method attempts to auto-detect the source language of the text.
      * @param text The text to translate.
      * @param targetLanguage The desired language to translate to (for example, "hi").
      * @return The translation result. If translation is unavailable, returns the same text back.

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 64edbeb..fa3b7fc 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -471,6 +471,19 @@
                     <execute />
                   </action>
                 </pluginExecution>
+                <pluginExecution>
+                  <pluginExecutionFilter>
+                    <groupId>org.codehaus.gmaven</groupId>
+                    <artifactId>groovy-maven-plugin</artifactId>
+                    <versionRange>[2.0,)</versionRange>
+                    <goals>
+                      <goal>execute</goal>
+                    </goals>
+                  </pluginExecutionFilter>
+                  <action>
+                    <ignore></ignore>
+                  </action>
+                </pluginExecution>
               </pluginExecutions>
             </lifecycleMappingMetadata>
           </configuration>
@@ -506,6 +519,7 @@
           <plugin>
             <groupId>org.codehaus.gmaven</groupId>
             <artifactId>groovy-maven-plugin</artifactId>
+            <version>2.0</version>
             <dependencies>
               <dependency>
                 <groupId>org.apache.maven</groupId>

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
index d892ab9..2a331bb 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/AbstractTranslator.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.language.translate;
 
 import java.io.IOException;
@@ -9,7 +25,7 @@ import org.apache.tika.language.detect.LanguageResult;
 
 public abstract class AbstractTranslator implements Translator {
 
-	protected LanguageResult detectLanguage(String text) throws IOException {
+    protected LanguageResult detectLanguage(String text) throws IOException {
         LanguageDetector detector = new OptimaizeLangDetector().loadModels();
         return detector.detect(text);
 	}

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
index 29c03c6..cdab2ad 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/GoogleTranslator.java
@@ -24,8 +24,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.Properties;
-import java.util.logging.Logger;
-
 import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 
@@ -51,9 +49,6 @@ public class GoogleTranslator extends AbstractTranslator {
 
 	private static final String DEFAULT_KEY = "dummy-secret";
 
-	private static final Logger LOG = Logger.getLogger(GoogleTranslator.class
-			.getName());
-
 	private WebClient client;
 
 	private String apiKey;

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
new file mode 100644
index 0000000..e97389c
--- /dev/null
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language.translate;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Properties;
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.exception.TikaException;
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * <p>This translator is designed to work with a TCP-IP available
+ * Joshua translation server, specifically the
+ * <a href="https://github.com/joshua-decoder/joshua_translation_engine">
+ * REST-based Joshua server</a>.</p>
+ * 
+ * <p>If you were to interact with the server via curl a request
+ * would look as follows</p>
+ * 
+ * <pre>
+ * {@code
+ * curl http://localhost:5000/joshua/translate/english \
+ *   -i -H "Content-Type: application/json" \
+ *   -X POST -d '{"inputLanguage": "Spanish", "inputText": "vuelo"}' -v
+ * }
+ * </pre>
+ * 
+ * Joshua requires input to be pre-formatted into sentences, one per line,
+ * so this translation implementation takes care of that.
+ */
+public class JoshuaNetworkTranslator extends AbstractTranslator {
+
+  /** Classpath resource, resolved relative to this class, holding the settings. */
+  private static final String PROPERTIES_FILE = "translator.joshua.properties";
+  /** Property key under which the Joshua server URL is looked up. */
+  private static final String JOSHUA_SERVER = "joshua.server.url";
+  /** Server URL read by the constructor; null when the file or key is absent. */
+  private String networkServer;
+  private WebClient client;
+
+  /**
+   * Loads <code>translator.joshua.properties</code> from the classpath
+   * and reads the server URL from <code>joshua.server.url</code>, falling
+   * back to the <code>joshua.server</code> key that the bundled
+   * properties file declares. No remote call is made here: a missing
+   * file, a missing key or a blank value simply leaves the server unset,
+   * in which case {@link #isAvailable()} reports <code>false</code> and
+   * translation requests return their input text.
+   */
+  public JoshuaNetworkTranslator() {
+    Properties props = new Properties();
+    try (InputStream stream =
+        JoshuaNetworkTranslator.class.getResourceAsStream(PROPERTIES_FILE)) {
+      if (stream != null) {
+        props.load(stream);
+        String url = props.getProperty(JOSHUA_SERVER, props.getProperty("joshua.server"));
+        networkServer = (url == null || url.trim().isEmpty()) ? null : url.trim();
+      }
+    } catch (IOException e) {
+      // Error with properties file. Translation will not work.
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * <p>Translates <code>text</code> through the configured Joshua REST
+   * server. If the translator is not available the input text is returned
+   * unchanged. If no source language (a null value) has been provided then
+   * we make an attempt to guess it using Tika's
+   * {@link org.apache.tika.langdetect.OptimaizeLangDetector}.</p>
+   *
+   * <p>We then process the input text into a new string consisting of
+   * sentences, one per line, by replacing every '.' with \n.</p>
+   *
+   * @throws TikaException if the server reports an error or an unusable reply
+   * @see org.apache.tika.language.translate.Translator#translate
+   * (java.lang.String, java.lang.String, java.lang.String)
+   */
+  @Override
+  public String translate(String text, String sourceLanguage,
+      String targetLanguage) throws TikaException, IOException {
+    if (!this.isAvailable())
+      return text;
+
+    //make an attempt to guess language if one is not provided.
+    if (sourceLanguage == null)
+      sourceLanguage = detectLanguage(text).getLanguage();
+
+    //process input text into sentences, one per line, by turning every
+    //'.' into \n; String.replace covers all occurrences, including a '.'
+    //sitting at index 0 of the text
+    text = text.replace('.', '\n');
+
+    //create client
+    if (!networkServer.endsWith("/")) {
+      client = WebClient.create(networkServer + "/" + targetLanguage + "/");
+    } else {
+      client = WebClient.create(networkServer + targetLanguage + "/");
+    }
+
+    //make the request
+    Response response = client.accept(MediaType.APPLICATION_JSON)
+        .query("inputLanguage", sourceLanguage)
+        .query("inputText", text).get();
+
+    //read the whole body; try-with-resources closes the entity stream even
+    //when reading fails part way through
+    StringBuilder responseText = new StringBuilder();
+    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
+        (InputStream) response.getEntity(), UTF_8))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        responseText.append(line);
+      }
+    }
+
+    //parse the JSON reply and map the status code it carries to a result
+    try {
+      ObjectMapper mapper = new ObjectMapper();
+      JsonNode jsonResp = mapper.readTree(responseText.toString());
+
+      if (!jsonResp.findValuesAsText("code").isEmpty()) {
+        String code = jsonResp.findValuesAsText("code").get(0);
+        if (code.equals("200")) {
+          return jsonResp.findValue("text").get(0).asText();
+        } else {
+          throw new TikaException(jsonResp.findValue("message").get(0).asText());
+        }
+      } else {
+        throw new TikaException("Return message not recognized: " + 
+            responseText.toString().substring(0, Math.min(responseText.length(), 100)));
+      }
+    } catch (JsonParseException e) {
+      throw new TikaException("Error requesting translation from '" + 
+          sourceLanguage + "' to '" + targetLanguage + "', JSON response "
+          + "from Joshua REST Server is not well formatted: " + responseText.toString(), e);
+    }
+  }
+
+  /**
+   * Returns the text as-is when no server is configured; otherwise guesses the source language via
+   * {@link org.apache.tika.language.translate.AbstractTranslator#detectLanguage(String)}
+   * before making the call to
+   * {@link org.apache.tika.language.translate.JoshuaNetworkTranslator#translate(String, String, String)}
+   * @see org.apache.tika.language.translate.Translator#translate(java.lang.String, java.lang.String)
+   */
+  @Override
+  public String translate(String text, String targetLanguage)
+      throws TikaException, IOException {
+    if (!isAvailable())
+      return text;
+    String sourceLanguage = detectLanguage(text).getLanguage();
+    return translate(text, sourceLanguage, targetLanguage);
+  }
+
+  /** Implements {@link Translator#isAvailable()}: a local null check, no remote call. */
+  @Override
+  public boolean isAvailable() {
+    // available once the constructor has stored a server URL
+    final boolean serverConfigured = networkServer != null;
+    return serverConfigured;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java
index 8a976fe..fb9c743 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/MosesTranslator.java
@@ -76,6 +76,7 @@ public class MosesTranslator extends ExternalTranslator {
     public String translate(String text, String sourceLanguage, String targetLanguage) throws TikaException, IOException {
         if (!isAvailable() || !checkCommand(buildCheckCommand(smtPath), 1)) return text;
         File tmpFile = new File(TMP_FILE_NAME);
+        @SuppressWarnings("resource")
         OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(tmpFile), Charset.defaultCharset());
         out.append(text).append('\n').close();
 
@@ -84,6 +85,7 @@ public class MosesTranslator extends ExternalTranslator {
         File tmpTranslatedFile = new File(TMP_FILE_NAME + ".translated");
 
         StringBuilder stringBuilder = new StringBuilder();
+        @SuppressWarnings("resource")
         BufferedReader reader = new BufferedReader(new InputStreamReader(
                 new FileInputStream(tmpTranslatedFile),
                 Charset.defaultCharset()

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties
index edbc732..4e622ce 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.google.properties
@@ -12,11 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Must set the client keys in this file to use translation. Please see
-# https://code.google.com/p/microsoft-translator-java-api/ and
-# http://msdn.microsoft.com/en-us/library/hh454950.aspx for help with
-# getting these keys. As of now (6/2014) 2,000,000 characters/month
-# are free.
+
 #
 # To use the Google translation service, you <em>must</em> set your API-key
 # as described in GoogleTranslator. If you do not want translation

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
new file mode 100644
index 0000000..81071f3
--- /dev/null
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# The property below is partially described at
+# https://github.com/joshua-decoder/joshua_translation_engine#requesting-translations
+# If it is left empty, translation will not occur and the source text
+# will be returned unchanged.
+# An example value would be http://localhost:5000/joshua/translate/
+joshua.server=
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties
index 04e0883..24756ac 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.lingo24.properties
@@ -12,11 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Must set the client keys in this file to use translation. Please see
-# https://code.google.com/p/microsoft-translator-java-api/ and
-# http://msdn.microsoft.com/en-us/library/hh454950.aspx for help with
-# getting these keys. As of now (6/2014) 2,000,000 characters/month
-# are free.
 #
 # To use the Lingo24 translation service, you <em>must</em> set your API-key
 # as described in Lingo24Translator. If you do not want translation

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties
index 72f2d20..55f9176 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.moses.properties
@@ -12,11 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Must set the client keys in this file to use translation. Please see
-# https://code.google.com/p/microsoft-translator-java-api/ and
-# http://msdn.microsoft.com/en-us/library/hh454950.aspx for help with
-# getting these keys. As of now (6/2014) 2,000,000 characters/month
-# are free.
 
 # smt_path is the full path to the Moses jar to run.
 # script_path is the full path to the script to pass to the smt jar.

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties
index 57c11e0..602445e 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.yandex.properties
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # To use the YANDEX translate service, you <em>must</em> set your API-Key
 # as described in Translate API, https://tech.yandex.com/translate/
 # If you do not want translation please set the value to "dummy-key".

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
new file mode 100644
index 0000000..2cf7b3a
--- /dev/null
+++ b/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language.translate;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertTrue;
+
+/** Exercises {@link JoshuaNetworkTranslator}; the assertion only runs when a server is configured. */
+public class JoshuaNetworkTranslatorTest {
+  JoshuaNetworkTranslator translator;
+
+  @Before
+  public void setUp() {
+    translator = new JoshuaNetworkTranslator();
+  }
+
+  @Test
+  public void testSimpleSpanishToEnglishTranslation() throws Exception {
+    String source = "hola", expected = "hello";
+    String translated = translator.translate(source, "es", "en");
+    if (!translator.isAvailable()) return;  // nothing to verify without a server
+    String failureMessage = "Translate " + source + " to " + expected + " (was " + translated + ")";
+    assertTrue(failureMessage, expected.equalsIgnoreCase(translated));
+  }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java
index a35281d..45d246e 100644
--- a/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java
+++ b/tika-translate/src/test/java/org/apache/tika/language/translate/MicrosoftTranslatorTest.java
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.language.translate;
 
-import org.apache.tika.Tika;
 import org.junit.Before;
 import org.junit.Test;
 

http://git-wip-us.apache.org/repos/asf/tika/blob/d4fb28f9/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java
index adac4be..2c5d969 100644
--- a/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java
+++ b/tika-translate/src/test/java/org/apache/tika/language/translate/YandexTranslatorTest.java
@@ -1,4 +1,3 @@
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -18,7 +17,6 @@
 
 package org.apache.tika.language.translate;
 
-import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.fail;
 import static org.junit.Assume.assumeTrue;


[2/7] tika git commit: Merge branch 'master' into TIKA-1343

Posted by le...@apache.org.
Merge branch 'master' into TIKA-1343


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4aff4839
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4aff4839
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4aff4839

Branch: refs/heads/master
Commit: 4aff4839aece41a739b93169cf7a475ecfc5c70c
Parents: d4fb28f c93ff3e
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Thu May 5 14:03:01 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Thu May 5 14:03:01 2016 -0700

----------------------------------------------------------------------
 .../org/apache/tika/mime/tika-mimetypes.xml     |   8 +
 .../tika/parser/code/SourceCodeParser.java      | 142 ++++++++--------
 .../apache/tika/parser/image/ICNSParser.java    | 117 +++++++++++++
 .../org/apache/tika/parser/image/ICNSType.java  | 170 +++++++++++++++++++
 .../parser/mp4/DirectFileReadDataSource.java    |   2 +-
 .../org/apache/tika/parser/mp4/MP4Parser.java   |  13 +-
 .../services/org.apache.tika.parser.Parser      |   3 +-
 .../org/apache/tika/mime/TestMimeTypes.java     |   8 +
 .../tika/parser/image/ICNSParserTest.java       |  65 +++++++
 .../test/resources/test-documents/testICNS.icns | Bin 0 -> 2472 bytes
 .../test-documents/testICNS_basic.icns          | Bin 0 -> 18199 bytes
 .../resources/test-documents/testKeynoteNew.key | Bin 0 -> 274397 bytes
 .../test-documents/testNumbersNew.numbers       | Bin 0 -> 179147 bytes
 .../resources/test-documents/testPagesNew.pages | Bin 0 -> 237567 bytes
 14 files changed, 447 insertions(+), 81 deletions(-)
----------------------------------------------------------------------



[4/7] tika git commit: Improve logging and trivial code conventions

Posted by le...@apache.org.
Improve logging and trivial code conventions


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a1250ff3
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a1250ff3
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a1250ff3

Branch: refs/heads/master
Commit: a1250ff33c68065e4a812285dfa6a6bd2a6a22de
Parents: fe559b8
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Wed Sep 21 08:05:35 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Wed Sep 21 08:05:35 2016 -0700

----------------------------------------------------------------------
 .../translate/JoshuaNetworkTranslator.java        | 18 +++++++++++-------
 .../translate/translator.joshua.properties        |  2 +-
 2 files changed, 12 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/a1250ff3/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
index e97389c..8e1f768 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
@@ -29,6 +29,9 @@ import javax.ws.rs.core.Response;
 
 import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.tika.exception.TikaException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import com.fasterxml.jackson.core.JsonParseException;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
@@ -54,10 +57,12 @@ import com.fasterxml.jackson.databind.ObjectMapper;
  * so this translation implementation takes care of that.
  */
 public class JoshuaNetworkTranslator extends AbstractTranslator {
+  
+  private static final Logger LOG = LoggerFactory.getLogger(JoshuaNetworkTranslator.class);
 
   private static final String PROPERTIES_FILE = "translator.joshua.properties";
 
-  private String JOSHUA_SERVER = "joshua.server.url";
+  private static final String JOSHUA_SERVER = "joshua.server.url";
 
   private String networkServer;
   
@@ -82,8 +87,7 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
         networkServer = props.getProperty(JOSHUA_SERVER);
       }
     } catch (IOException e) {
-      // Error with properties file. Translation will not work.
-      e.printStackTrace();
+      LOG.error("An error occured whilst reading translator.joshua.properties file", e);
     }
   }
 
@@ -119,7 +123,7 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
       sb.replace(i, i + 1, "\n");
     }
 
-    text = sb.toString();
+    String inputText = sb.toString();
 
     //create client
     if (!networkServer.endsWith("/")) {
@@ -131,10 +135,10 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
     //make the reuest
     Response response = client.accept(MediaType.APPLICATION_JSON)
         .query("inputLanguage", sourceLanguage)
-        .query("inputText", text).get();
+        .query("inputText", inputText).get();
     BufferedReader reader = new BufferedReader(new InputStreamReader(
         (InputStream) response.getEntity(), UTF_8));
-    String line = null;
+    String line;
     StringBuffer responseText = new StringBuffer();
     while ((line = reader.readLine()) != null) {
       responseText.append(line);
@@ -146,7 +150,7 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
 
       if (!jsonResp.findValuesAsText("code").isEmpty()) {
         String code = jsonResp.findValuesAsText("code").get(0);
-        if (code.equals("200")) {
+        if ("200".equals(code)) {
           return jsonResp.findValue("text").get(0).asText();
         } else {
           throw new TikaException(jsonResp.findValue("message").get(0).asText());

http://git-wip-us.apache.org/repos/asf/tika/blob/a1250ff3/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
index 81071f3..4894f48 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
@@ -19,4 +19,4 @@
 # if left as null, then translation will not occur and the source text
 # will be returned. 
 # An example would be http://localhost:5000/joshua/translate/
-joshua.server=
\ No newline at end of file
+joshua.server=http://localhost:5000/joshua/translate/
\ No newline at end of file


[7/7] tika git commit: TIKA-1343 Create a Tika Translator implementation that uses JoshuaDecoder

Posted by le...@apache.org.
TIKA-1343 Create a Tika Translator implementation that uses JoshuaDecoder


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/dadbf55c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/dadbf55c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/dadbf55c

Branch: refs/heads/master
Commit: dadbf55c51d166846aa0d365fd2ed340b604bfae
Parents: 5657ae6
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Mon Oct 24 22:20:04 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Mon Oct 24 22:20:04 2016 -0700

----------------------------------------------------------------------
 .../translate/JoshuaNetworkTranslator.java      | 44 ++++++++++----------
 ...rg.apache.tika.language.translate.Translator |  3 +-
 .../translate/translator.joshua.properties      |  2 +-
 .../translate/JoshuaNetworkTranslatorTest.java  |  2 +-
 4 files changed, 27 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/dadbf55c/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
index 8e1f768..8cf0adf 100644
--- a/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/JoshuaNetworkTranslator.java
@@ -22,6 +22,8 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Properties;
 
 import javax.ws.rs.core.MediaType;
@@ -35,6 +37,8 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.core.JsonParseException;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider;
 
 /**
  * <p>This translator is designed to work with a TCP-IP available
@@ -57,7 +61,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
  * so this translation implementation takes care of that.
  */
 public class JoshuaNetworkTranslator extends AbstractTranslator {
-  
+
   private static final Logger LOG = LoggerFactory.getLogger(JoshuaNetworkTranslator.class);
 
   private static final String PROPERTIES_FILE = "translator.joshua.properties";
@@ -65,8 +69,6 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
   private static final String JOSHUA_SERVER = "joshua.server.url";
 
   private String networkServer;
-  
-  private WebClient client;
 
   /**
    * Default constructor which first checks for the presence of
@@ -124,40 +126,40 @@ public class JoshuaNetworkTranslator extends AbstractTranslator {
     }
 
     String inputText = sb.toString();
+    WebClient client;
+    final List<Object> providers = new ArrayList<>();
+    JacksonJsonProvider jacksonJsonProvider = new JacksonJsonProvider();
+    providers.add(jacksonJsonProvider);
 
     //create client
     if (!networkServer.endsWith("/")) {
-      client = WebClient.create(networkServer + "/" + targetLanguage + "/");
+      client = WebClient.create(networkServer + "/" + targetLanguage, providers);
     } else {
-      client = WebClient.create(networkServer + targetLanguage + "/");
+      client = WebClient.create(networkServer + targetLanguage, providers);
     }
 
+    ObjectMapper requestMapper = new ObjectMapper();
+    ObjectNode jsonNode = requestMapper.createObjectNode();
+    jsonNode.put("inputLanguage", sourceLanguage);
+    jsonNode.put("inputText", inputText);
     //make the reuest
-    Response response = client.accept(MediaType.APPLICATION_JSON)
-        .query("inputLanguage", sourceLanguage)
-        .query("inputText", inputText).get();
+    Response response = client.accept(MediaType.APPLICATION_JSON).type(MediaType.APPLICATION_JSON).post(jsonNode);
     BufferedReader reader = new BufferedReader(new InputStreamReader(
         (InputStream) response.getEntity(), UTF_8));
     String line;
-    StringBuffer responseText = new StringBuffer();
+    StringBuilder responseText = new StringBuilder();
     while ((line = reader.readLine()) != null) {
       responseText.append(line);
     }
 
     try {
-      ObjectMapper mapper = new ObjectMapper();
-      JsonNode jsonResp = mapper.readTree(responseText.toString());
-
-      if (!jsonResp.findValuesAsText("code").isEmpty()) {
-        String code = jsonResp.findValuesAsText("code").get(0);
-        if ("200".equals(code)) {
-          return jsonResp.findValue("text").get(0).asText();
-        } else {
-          throw new TikaException(jsonResp.findValue("message").get(0).asText());
-        }
+      ObjectMapper responseMapper = new ObjectMapper();
+      JsonNode jsonResp = responseMapper.readTree(responseText.toString());
+
+      if (jsonResp.findValuesAsText("outputText") != null) {
+        return jsonResp.findValuesAsText("outputText").get(0);
       } else {
-        throw new TikaException("Return message not recognized: " + 
-            responseText.toString().substring(0, Math.min(responseText.length(), 100)));
+        throw new TikaException(jsonResp.findValue("message").get(0).asText());
       }
     } catch (JsonParseException e) {
       throw new TikaException("Error requesting translation from '" + 

http://git-wip-us.apache.org/repos/asf/tika/blob/dadbf55c/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
index 773daf3..f3dcad4 100644
--- a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
+++ b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator
@@ -16,4 +16,5 @@
 org.apache.tika.language.translate.MicrosoftTranslator
 org.apache.tika.language.translate.GoogleTranslator
 org.apache.tika.language.translate.Lingo24Translator
-org.apache.tika.language.translate.CachedTranslator
\ No newline at end of file
+org.apache.tika.language.translate.CachedTranslator
+org.apache.tika.language.translate.JoshuaNetworkTranslator

http://git-wip-us.apache.org/repos/asf/tika/blob/dadbf55c/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
----------------------------------------------------------------------
diff --git a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
index 4894f48..53bd773 100644
--- a/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
+++ b/tika-translate/src/main/resources/org/apache/tika/language/translate/translator.joshua.properties
@@ -19,4 +19,4 @@
 # if left as null, then translation will not occur and the source text
 # will be returned. 
 # An example would be http://localhost:5000/joshua/translate/
-joshua.server=http://localhost:5000/joshua/translate/
\ No newline at end of file
+joshua.server.url=http://localhost:5000/joshua/translate/
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/dadbf55c/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
----------------------------------------------------------------------
diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
index 2cf7b3a..4413926 100644
--- a/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
+++ b/tika-translate/src/test/java/org/apache/tika/language/translate/JoshuaNetworkTranslatorTest.java
@@ -34,7 +34,7 @@ public class JoshuaNetworkTranslatorTest {
   public void testSimpleSpanishToEnglishTranslation() throws Exception {
     String source = "hola";
     String expected = "hello";
-    String translated = translator.translate(source, "es", "en");
+    String translated = translator.translate(source, "spanish", "english");
     if (translator.isAvailable()) assertTrue("Translate " + source + " to " + expected + " (was " + translated + ")",
         expected.equalsIgnoreCase(translated));
   }


[6/7] tika git commit: Merge branch 'TIKA-1343' of https://github.com/lewismc/tika into TIKA-1343

Posted by le...@apache.org.
Merge branch 'TIKA-1343' of https://github.com/lewismc/tika into TIKA-1343


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5657ae66
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5657ae66
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5657ae66

Branch: refs/heads/master
Commit: 5657ae6616cd461a19676952f40082b2ec291dac
Parents: 7ca105e d50a693
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Mon Oct 24 10:49:26 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Mon Oct 24 10:49:26 2016 -0700

----------------------------------------------------------------------
 .../tika/language/detect/LanguageResult.java    |   6 +-
 .../tika/language/translate/Translator.java     |  20 +-
 tika-parsers/pom.xml                            |  14 ++
 .../language/translate/AbstractTranslator.java  |   2 +-
 .../language/translate/GoogleTranslator.java    |   5 -
 .../translate/JoshuaNetworkTranslator.java      | 193 +++++++++++++++++++
 .../language/translate/MosesTranslator.java     |   2 +
 .../translate/translator.google.properties      |   6 +-
 .../translate/translator.joshua.properties      |  22 +++
 .../translate/translator.lingo24.properties     |   5 -
 .../translate/translator.moses.properties       |   5 -
 .../translate/JoshuaNetworkTranslatorTest.java  |  41 ++++
 .../translate/MicrosoftTranslatorTest.java      |   1 -
 .../translate/YandexTranslatorTest.java         |   2 -
 14 files changed, 282 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/5657ae66/tika-parsers/pom.xml
----------------------------------------------------------------------


[3/7] tika git commit: Merge master into TIKA-1343

Posted by le...@apache.org.
Merge master into TIKA-1343


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/fe559b80
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/fe559b80
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/fe559b80

Branch: refs/heads/master
Commit: fe559b80bcad1f107904ca7a89724a26ea2921a1
Parents: 4aff483 23a11ef
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Fri Jul 1 13:35:52 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Fri Jul 1 13:35:52 2016 -0700

----------------------------------------------------------------------
 CHANGES.txt                                     |   38 +-
 pom.xml                                         |    2 +-
 tika-app/pom.xml                                |    2 +-
 .../main/java/org/apache/tika/cli/TikaCLI.java  |    9 +-
 tika-batch/pom.xml                              |    2 +-
 tika-bundle/pom.xml                             |    2 +-
 tika-core/pom.xml                               |    2 +-
 .../org/apache/tika/detect/NameDetector.java    |   15 +-
 .../tika/detect/ZeroSizeFileDetector.java       |   45 +
 .../java/org/apache/tika/io/EndianUtils.java    |  829 +++---
 .../tika/metadata/TikaCoreProperties.java       |    7 +
 .../java/org/apache/tika/mime/MediaType.java    |    3 +
 .../org/apache/tika/mime/MediaTypeRegistry.java |    2 +
 .../org/apache/tika/mime/tika-mimetypes.xml     |   69 +-
 .../java/org/apache/tika/TikaDetectionTest.java |    2 +-
 .../src/test/java/org/apache/tika/TikaTest.java |    6 +-
 .../apache/tika/detect/NameDetectorTest.java    |   10 +
 .../tika/detect/ZeroSizeFileDetectorTest.java   |   64 +
 .../org/apache/tika/io/EndianUtilsTest.java     |   35 +
 tika-example/pom.xml                            |    2 +-
 tika-java7/pom.xml                              |    2 +-
 tika-langdetect/pom.xml                         |    3 +-
 ...apache.tika.language.detect.LanguageDetector |   15 +
 tika-parent/pom.xml                             |    4 +-
 tika-parsers/pom.xml                            |    4 +-
 .../parser/apple/AppleSingleFileParser.java     |  205 ++
 .../org/apache/tika/parser/dbf/DBFCell.java     |  147 +
 .../apache/tika/parser/dbf/DBFColumnHeader.java |   97 +
 .../apache/tika/parser/dbf/DBFFileHeader.java   |  144 +
 .../org/apache/tika/parser/dbf/DBFParser.java   |  155 ++
 .../org/apache/tika/parser/dbf/DBFReader.java   |  207 ++
 .../java/org/apache/tika/parser/dbf/DBFRow.java |   62 +
 .../apache/tika/parser/geo/topic/GeoParser.java |   14 +-
 .../tika/parser/html/HtmlEncodingDetector.java  |   16 +-
 .../apache/tika/parser/html/HtmlHandler.java    |    3 +
 .../tika/parser/image/xmp/JempboxExtractor.java |   30 +
 .../iwork/iwana/IWork13PackageParser.java       |   86 +
 .../tika/parser/mail/MailContentHandler.java    |  110 +-
 .../microsoft/AbstractPOIFSExtractor.java       |   32 +-
 .../tika/parser/microsoft/HSLFExtractor.java    |   32 +-
 .../parser/microsoft/JackcessExtractor.java     |    4 +-
 .../parser/microsoft/MSOwnerFileParser.java     |   81 +
 .../tika/parser/microsoft/OfficeParser.java     |    2 +-
 .../tika/parser/microsoft/WordExtractor.java    |   22 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |   12 +-
 .../ooxml/XSLFPowerPointExtractorDecorator.java |   58 +-
 .../ooxml/XSSFExcelExtractorDecorator.java      |   99 +-
 .../microsoft/xml/AbstractXML2003Parser.java    |  128 +
 .../parser/microsoft/xml/HyperlinkHandler.java  |   96 +
 .../microsoft/xml/SpreadsheetMLParser.java      |  175 ++
 .../tika/parser/microsoft/xml/WordMLParser.java |  306 +++
 .../parser/ner/grobid/GrobidNERecogniser.java   |   28 +-
 .../tika/parser/ocr/TesseractOCRParser.java     |   87 +-
 .../tika/parser/pdf/AbstractPDF2XHTML.java      |  578 ++++
 .../org/apache/tika/parser/pdf/OCR2XHTML.java   |  127 +
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |  518 +---
 .../org/apache/tika/parser/pdf/PDFParser.java   |    7 +
 .../apache/tika/parser/pdf/PDFParserConfig.java |  274 +-
 .../tika/parser/pkg/ZipContainerDetector.java   |   12 +
 .../tika/parser/rtf/RTFEmbObjHandler.java       |    7 +-
 .../tika/parser/rtf/RTFObjDataParser.java       |   43 +-
 .../apache/tika/parser/rtf/TextExtractor.java   |   11 +-
 .../services/org.apache.tika.parser.Parser      |    7 +-
 .../apache/tika/parser/pdf/PDFParser.properties |   10 +-
 .../tika/detect/TestContainerAwareDetector.java |   11 +
 .../org/apache/tika/mime/TestMimeTypes.java     |   38 +-
 .../parser/apple/AppleSingleFileParserTest.java |   46 +
 .../apache/tika/parser/dbf/DBFParserTest.java   |  158 ++
 .../apache/tika/parser/html/HtmlParserTest.java |   60 +-
 .../parser/image/xmp/JempboxExtractorTest.java  |   29 +-
 .../tika/parser/mail/RFC822ParserTest.java      |  115 +
 .../tika/parser/microsoft/ExcelParserTest.java  |   28 +-
 .../parser/microsoft/MSOwnerFileParserTest.java |   31 +
 .../microsoft/POIContainerExtractionTest.java   |    4 +-
 .../parser/microsoft/PowerPointParserTest.java  |   13 +-
 .../tika/parser/microsoft/WordParserTest.java   |   19 +
 .../ooxml/OOXMLContainerExtractionTest.java     |    2 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java |   43 +-
 .../parser/microsoft/xml/XML2003ParserTest.java |  109 +
 .../apache/tika/parser/pdf/PDFParserTest.java   |   74 +-
 .../apache/tika/parser/rtf/RTFParserTest.java   |  127 +-
 .../test-documents/testAppleSingleFile.pdf      |  Bin 0 -> 1893 bytes
 .../test/resources/test-documents/testDBF.dbf   |  Bin 0 -> 890 bytes
 .../test-documents/testDBF_gb18030.dbf          |  Bin 0 -> 144 bytes
 .../test/resources/test-documents/testDJVU.djvu |  Bin 0 -> 89 bytes
 .../resources/test-documents/testEXCEL2003.xml  |  100 +
 .../test-documents/testEXCEL_hyperlinks.xls     |  Bin 0 -> 29696 bytes
 .../test-documents/testEXCEL_hyperlinks.xlsx    |  Bin 0 -> 10038 bytes
 .../resources/test-documents/testEXCEL_poi.xlsx |  Bin 0 -> 3360 bytes
 .../test-documents/testEndNoteImportFile.enw    |   10 +
 .../test-documents/testExcel_embeddedPDF.xls    |  Bin 0 -> 38400 bytes
 .../test-documents/testExcel_embeddedPDF.xlsx   |  Bin 0 -> 25602 bytes
 .../resources/test-documents/testICalendar.ics  |   15 +
 .../test-documents/testKeynote2013.key          |  Bin 0 -> 274397 bytes
 .../resources/test-documents/testKeynoteNew.key |  Bin 274397 -> 0 bytes
 .../resources/test-documents/testMSOwnerFile    |  Bin 0 -> 162 bytes
 .../test-documents/testNumbers2013.numbers      |  Bin 0 -> 179147 bytes
 .../test-documents/testNumbersNew.numbers       |  Bin 179147 -> 0 bytes
 .../test-documents/testPPT_EmbeddedPDF.ppt      |  Bin 0 -> 187392 bytes
 .../test-documents/testPPT_EmbeddedPDF.pptx     |  Bin 0 -> 108637 bytes
 .../test-documents/testPages2013.pages          |  Bin 0 -> 237567 bytes
 .../resources/test-documents/testPagesNew.pages |  Bin 237567 -> 0 bytes
 .../test-documents/testRFC822_date_utf8         |    8 +
 .../resources/test-documents/testRFC822_eml     |   33 +
 .../resources/test-documents/testVCalendar.vcs  |   10 +
 .../resources/test-documents/testWORD2003.xml   | 2542 ++++++++++++++++++
 .../test-documents/testWindowsMediaMeta.asx     |    6 +
 .../test/resources/test-documents/testXMP.xmp   |  178 ++
 .../test-documents/test_recursive_embedded.doc  |  Bin 0 -> 31744 bytes
 tika-serialization/pom.xml                      |    2 +-
 tika-server/pom.xml                             |    2 +-
 tika-translate/pom.xml                          |    2 +-
 .../translate/translator.yandex.properties      |    2 +-
 tika-xmp/pom.xml                                |    2 +-
 114 files changed, 7822 insertions(+), 1203 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/fe559b80/tika-parsers/pom.xml
----------------------------------------------------------------------


[5/7] tika git commit: Merge branch 'master' into TIKA-1343

Posted by le...@apache.org.
Merge branch 'master' into TIKA-1343


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/d50a6936
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/d50a6936
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/d50a6936

Branch: refs/heads/master
Commit: d50a69361bd0196fb2595313cb47222f61701ba4
Parents: a1250ff 07aea36
Author: Lewis John McGibbney <le...@gmail.com>
Authored: Wed Sep 21 08:06:47 2016 -0700
Committer: Lewis John McGibbney <le...@gmail.com>
Committed: Wed Sep 21 08:06:47 2016 -0700

----------------------------------------------------------------------
 CHANGES.txt                                     |  30 +
 tika-bundle/pom.xml                             |   2 +-
 .../main/java/org/apache/tika/config/Field.java |  45 +
 .../org/apache/tika/config/Initializable.java   |  33 +
 .../main/java/org/apache/tika/config/Param.java | 191 +++++
 .../java/org/apache/tika/config/ParamField.java | 158 ++++
 .../java/org/apache/tika/config/TikaConfig.java |  47 +-
 .../tika/exception/TikaConfigException.java     |  39 +
 .../org/apache/tika/parser/AbstractParser.java  |  10 +
 .../java/org/apache/tika/parser/Parser.java     |   1 +
 .../tika/parser/external/ExternalParser.java    |  85 +-
 .../apache/tika/sax/XHTMLContentHandler.java    |   5 +-
 .../org/apache/tika/utils/AnnotationUtils.java  | 138 +++
 .../apache/tika/utils/ServiceLoaderUtils.java   |  30 +
 .../org/apache/tika/mime/tika-mimetypes.xml     |  67 +-
 .../java/org/apache/tika/config/ParamTest.java  |  71 ++
 .../tika/parser/DummyInitializableParser.java   |  68 ++
 .../tika/parser/DummyParameterizedParser.java   | 113 +++
 .../tika/parser/InitializableParserTest.java    |  45 +
 .../tika/parser/ParameterizedParserTest.java    | 125 +++
 .../apache/tika/utils/AnnotationUtilsTest.java  | 190 +++++
 .../tika/config/TIKA-1508-configurable.xml      |  37 +
 .../tika/config/TIKA-1986-bad-parameters.xml    |  26 +
 .../apache/tika/config/TIKA-1986-bad-types.xml  |  26 +
 .../apache/tika/config/TIKA-1986-bad-values.xml |  26 +
 .../tika/config/TIKA-1986-initializable.xml     |  28 +
 .../TIKA-1986-parameterized-decorated.xml       |  39 +
 .../tika/config/TIKA-1986-parameterized.xml     |  38 +
 .../tika/config/TIKA-1986-some-parameters.xml   |  28 +
 tika-parent/pom.xml                             |  12 +-
 tika-parsers/pom.xml                            |  26 +-
 .../chm/accessor/ChmDirectoryListingSet.java    |  11 +-
 .../apache/tika/parser/chm/core/ChmCommons.java |   5 +-
 .../tika/parser/chm/core/ChmExtractor.java      |   4 +-
 .../apache/tika/parser/chm/lzx/ChmLzxBlock.java |   4 +-
 .../tika/parser/mail/MailContentHandler.java    |  13 +-
 .../org/apache/tika/parser/mat/MatParser.java   |   5 +
 .../tika/parser/microsoft/ExcelExtractor.java   |  34 +-
 .../microsoft/TikaExcelDataFormatter.java       |  41 +
 .../microsoft/TikaExcelGeneralFormat.java       |  90 ++
 .../tika/parser/microsoft/WordExtractor.java    |  20 +
 .../microsoft/ooxml/MetadataExtractor.java      |  15 +-
 .../ooxml/XSSFExcelExtractorDecorator.java      |  20 +-
 .../ooxml/XWPFWordExtractorDecorator.java       |  52 +-
 .../microsoft/xml/AbstractXML2003Parser.java    |   4 +
 .../tika/parser/microsoft/xml/WordMLParser.java |   3 +
 .../tika/parser/ocr/TesseractOCRConfig.java     | 181 +++-
 .../tika/parser/ocr/TesseractOCRParser.java     | 113 ++-
 .../parser/odf/OpenDocumentContentParser.java   |   3 +
 .../tika/parser/pdf/AbstractPDF2XHTML.java      |  16 +-
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |   1 -
 .../org/apache/tika/parser/pdf/PDFParser.java   |  29 +-
 .../apache/tika/parser/pdf/PDFParserConfig.java |  86 +-
 .../parser/recognition/ObjectRecogniser.java    |  75 ++
 .../recognition/ObjectRecognitionParser.java    | 171 ++++
 .../parser/recognition/RecognisedObject.java    |  91 ++
 .../tf/TensorflowImageRecParser.java            | 152 ++++
 .../tf/TensorflowRESTRecogniser.java            | 142 ++++
 .../apache/tika/parser/txt/CharsetDetector.java | 416 +++++----
 .../apache/tika/parser/txt/CharsetMatch.java    | 139 ++-
 .../tika/parser/txt/CharsetRecog_2022.java      |  28 +-
 .../tika/parser/txt/CharsetRecog_UTF8.java      |  24 +-
 .../tika/parser/txt/CharsetRecog_Unicode.java   |  99 ++-
 .../tika/parser/txt/CharsetRecog_mbcs.java      |  44 +-
 .../tika/parser/txt/CharsetRecog_sbcs.java      | 835 ++++++++++---------
 .../tika/parser/txt/CharsetRecognizer.java      |  31 +-
 .../parser/ocr/TesseractOCRConfig.properties    |  13 +-
 .../org/apache/tika/parser/ocr/rotation.py      |  72 ++
 .../recognition/tf/InceptionRestDockerfile      |  41 +
 .../parser/recognition/tf/classify_image.py     | 212 +++++
 .../tika/parser/recognition/tf/inceptionapi.py  | 319 +++++++
 .../org/apache/tika/mime/TestMimeTypes.java     |  13 +
 .../tika/parser/chm/TestChmExtractor.java       |  21 +-
 .../apache/tika/parser/html/HtmlParserTest.java | 140 +++-
 .../tika/parser/mail/RFC822ParserTest.java      |  68 +-
 .../apache/tika/parser/mbox/MboxParserTest.java |   1 -
 .../tika/parser/microsoft/ExcelParserTest.java  |  10 +
 .../tika/parser/microsoft/WordParserTest.java   |  11 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  28 +-
 .../parser/microsoft/xml/XML2003ParserTest.java |   1 +
 .../tika/parser/ocr/TesseractOCRConfigTest.java |  61 +-
 .../tika/parser/ocr/TesseractOCRParserTest.java |  18 +-
 .../apache/tika/parser/pdf/PDFParserTest.java   |  44 +-
 .../ObjectRecognitionParserTest.java            |  89 ++
 .../tf/TensorflowImageRecParserTest.java        |  58 ++
 .../parser/pdf/tika-config-non-primitives.xml   |  29 +
 .../org/apache/tika/parser/pdf/tika-config.xml  |  26 +
 .../recognition/tika-config-tflow-rest.xml      |  30 +
 .../parser/recognition/tika-config-tflow.xml    |  29 +
 .../resources/test-documents/testChm_oom.chm    | Bin 0 -> 4315 bytes
 .../test-documents/testEXCEL_big_numbers.xls    | Bin 0 -> 26112 bytes
 .../test-documents/testEXCEL_big_numbers.xlsx   | Bin 0 -> 8396 bytes
 .../test-documents/testEmailWithPNGAtt.eml      | 354 ++++++++
 .../resources/test-documents/testHTML_head.html |  32 +
 .../test-documents/testOpenOffice2.odt          | Bin 26448 -> 26460 bytes
 .../resources/test-documents/testStataDTA.dta   | Bin 0 -> 1207 bytes
 .../resources/test-documents/testStataDTA.txt   |  15 +
 .../resources/test-documents/testWORD2003.xml   |   2 +-
 .../test-documents/testWORD_boldHyperlink.doc   | Bin 0 -> 27136 bytes
 .../test-documents/testWORD_boldHyperlink.docx  | Bin 0 -> 12382 bytes
 .../testWORD_totalTimeOutOfRange.docx           | Bin 0 -> 11047 bytes
 .../TesseractOCRConfig-full.properties          |   6 +
 .../TesseractOCRConfig-partial.properties       |   8 +-
 tika-translate/pom.xml                          |   2 +-
 104 files changed, 5612 insertions(+), 917 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/d50a6936/tika-parsers/pom.xml
----------------------------------------------------------------------